Commit 05ccaf88 by Lukasz Kaiser: Merge pull request #3521 from YknZhu/master

Add deeplab model in tensorflow models
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Exports trained model to TensorFlow frozen graph."""
import os
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from deeplab import common
from deeplab import input_preprocess
from deeplab import model
slim = tf.contrib.slim
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('checkpoint_path', None, 'Checkpoint path')
flags.DEFINE_string('export_path', None,
'Path to output Tensorflow frozen graph.')
flags.DEFINE_integer('num_classes', 21, 'Number of classes.')
flags.DEFINE_multi_integer('crop_size', [513, 513],
'Crop size [height, width].')
# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
# atrous_rates = [6, 12, 18] if output_stride = 16. For `mobilenet_v2`, use
# None. Note one could use different atrous_rates/output_stride during
# training/evaluation.
flags.DEFINE_multi_integer('atrous_rates', None,
'Atrous rates for atrous spatial pyramid pooling.')
flags.DEFINE_integer('output_stride', 8,
'The ratio of input to output spatial resolution.')
# Change to [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] for multi-scale inference.
flags.DEFINE_multi_float('inference_scales', [1.0],
'The scales to resize images for inference.')
flags.DEFINE_bool('add_flipped_images', False,
'Add flipped images during inference or not.')
# Input name of the exported model.
_INPUT_NAME = 'ImageTensor'
# Output name of the exported model.
_OUTPUT_NAME = 'SemanticPredictions'
def _create_input_tensors():
"""Creates and prepares input tensors for DeepLab model.
This method creates a 4-D uint8 image tensor 'ImageTensor' with shape
[1, None, None, 3]. The actual input tensor name to use during inference is
'ImageTensor:0'.
Returns:
image: Preprocessed 4-D float32 tensor with shape [1, crop_height,
crop_width, 3].
original_image_size: Original image shape tensor [height, width].
resized_image_size: Resized image shape tensor [height, width].
"""
  # The exported model takes a 4-D uint8 image tensor as input.
input_image = tf.placeholder(tf.uint8, [1, None, None, 3], name=_INPUT_NAME)
original_image_size = tf.shape(input_image)[1:3]
# Squeeze the dimension in axis=0 since `preprocess_image_and_label` assumes
# image to be 3-D.
image = tf.squeeze(input_image, axis=0)
resized_image, image, _ = input_preprocess.preprocess_image_and_label(
image,
label=None,
crop_height=FLAGS.crop_size[0],
crop_width=FLAGS.crop_size[1],
min_resize_value=FLAGS.min_resize_value,
max_resize_value=FLAGS.max_resize_value,
resize_factor=FLAGS.resize_factor,
is_training=False,
model_variant=FLAGS.model_variant)
resized_image_size = tf.shape(resized_image)[:2]
# Expand the dimension in axis=0, since the following operations assume the
# image to be 4-D.
image = tf.expand_dims(image, 0)
return image, original_image_size, resized_image_size
def main(unused_argv):
tf.logging.set_verbosity(tf.logging.INFO)
tf.logging.info('Prepare to export model to: %s', FLAGS.export_path)
with tf.Graph().as_default():
image, image_size, resized_image_size = _create_input_tensors()
model_options = common.ModelOptions(
outputs_to_num_classes={common.OUTPUT_TYPE: FLAGS.num_classes},
crop_size=FLAGS.crop_size,
atrous_rates=FLAGS.atrous_rates,
output_stride=FLAGS.output_stride)
if tuple(FLAGS.inference_scales) == (1.0,):
tf.logging.info('Exported model performs single-scale inference.')
predictions = model.predict_labels(
image,
model_options=model_options,
image_pyramid=FLAGS.image_pyramid)
else:
tf.logging.info('Exported model performs multi-scale inference.')
predictions = model.predict_labels_multi_scale(
image,
model_options=model_options,
eval_scales=FLAGS.inference_scales,
add_flipped_images=FLAGS.add_flipped_images)
# Crop the valid regions from the predictions.
semantic_predictions = tf.slice(
predictions[common.OUTPUT_TYPE],
[0, 0, 0],
[1, resized_image_size[0], resized_image_size[1]])
# Resize back the prediction to the original image size.
def _resize_label(label, label_size):
# Expand dimension of label to [1, height, width, 1] for resize operation.
label = tf.expand_dims(label, 3)
resized_label = tf.image.resize_images(
label,
label_size,
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
align_corners=True)
return tf.squeeze(resized_label, 3)
semantic_predictions = _resize_label(semantic_predictions, image_size)
semantic_predictions = tf.identity(semantic_predictions, name=_OUTPUT_NAME)
saver = tf.train.Saver(tf.model_variables())
tf.gfile.MakeDirs(os.path.dirname(FLAGS.export_path))
freeze_graph.freeze_graph_with_def_protos(
tf.get_default_graph().as_graph_def(add_shapes=True),
saver.as_saver_def(),
FLAGS.checkpoint_path,
_OUTPUT_NAME,
restore_op_name=None,
filename_tensor_name=None,
output_graph=FLAGS.export_path,
clear_devices=True,
initializer_nodes=None)
if __name__ == '__main__':
flags.mark_flag_as_required('checkpoint_path')
flags.mark_flag_as_required('export_path')
tf.app.run()
# Running DeepLab on Cityscapes Semantic Segmentation Dataset
This page walks through the steps required to run DeepLab on Cityscapes on a
local machine.
## Download dataset and convert to TFRecord
We have prepared the script (under the folder `datasets`) to convert the
Cityscapes dataset to TFRecord. Users are required to download the dataset
beforehand by registering on the [website](https://www.cityscapes-dataset.com/).
```bash
# From the tensorflow/models/research/deeplab/datasets directory.
sh convert_cityscapes.sh
```
The converted dataset will be saved at `./deeplab/datasets/cityscapes/tfrecord`.
## Recommended Directory Structure for Training and Evaluation
```
+ datasets
+ cityscapes
+ leftImg8bit
+ gtFine
+ tfrecord
+ exp
+ train_on_train_set
+ train
+ eval
+ vis
```
where the folder `train_on_train_set` stores the train/eval/vis events and
results (when training DeepLab on the Cityscapes train set).
## Running the train/eval/vis jobs
A local training job using `xception_65` can be run with the following command:
```bash
# From tensorflow/models/research/
python deeplab/train.py \
--logtostderr \
--train_split="train" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=769 \
--train_crop_size=769 \
--train_batch_size=1 \
--tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
--train_logdir=${PATH_TO_TRAIN_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_INITIAL_CHECKPOINT} is the path to the initial checkpoint
(usually an ImageNet pretrained checkpoint), ${PATH_TO_TRAIN_DIR} is the
directory to which training checkpoints and events will be written, and
${PATH_TO_DATASET} is the directory in which the Cityscapes dataset resides.
Note that for {train,eval,vis}.py:
1. We use a small batch size during training. Users can change it based on the
   available GPU memory, and set `fine_tune_batch_norm` to True or False
   depending on the use case.
2. Change `atrous_rates` from [6, 12, 18] to [12, 24, 36] if setting
   `output_stride=8`.
3. Omit the flag `decoder_output_stride` if you do not want to use the decoder
   structure.
A local evaluation job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=1025 \
--eval_crop_size=2049 \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--eval_logdir=${PATH_TO_EVAL_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_EVAL_DIR} is the directory to which evaluation
events will be written, and ${PATH_TO_DATASET} is the directory in which the
Cityscapes dataset resides.
A local visualization job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=1025 \
--vis_crop_size=2049 \
--colormap_type="cityscapes" \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--vis_logdir=${PATH_TO_VIS_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_VIS_DIR} is the directory to which
visualization results will be written, and ${PATH_TO_DATASET} is the directory
in which the Cityscapes dataset resides. Note that if you would like to save
the segmentation results for the evaluation server, set
`also_save_raw_predictions=True`.
## Running Tensorboard
Progress for training and evaluation jobs can be inspected using Tensorboard. If
using the recommended directory structure, Tensorboard can be run using the
following command:
```bash
tensorboard --logdir=${PATH_TO_LOG_DIRECTORY}
```
where `${PATH_TO_LOG_DIRECTORY}` points to the directory that contains the
train, eval, and vis directories (e.g., the folder `train_on_train_set` in the
above example). Please note it may take Tensorboard a couple of minutes to
populate with data.
# Export trained deeplab model to frozen inference graph
After model training finishes, you could export it to a frozen TensorFlow
inference graph proto. Your trained model checkpoint usually includes the
following files:
* model.ckpt-${CHECKPOINT_NUMBER}.data-00000-of-00001
* model.ckpt-${CHECKPOINT_NUMBER}.index
* model.ckpt-${CHECKPOINT_NUMBER}.meta
After you have identified a candidate checkpoint to export, you can run the
following command line to export a frozen graph:
```bash
# From tensorflow/models/research/
# Assume all checkpoint files share the same path prefix `${CHECKPOINT_PATH}`.
python deeplab/export_model.py \
--checkpoint_path=${CHECKPOINT_PATH} \
--export_path=${OUTPUT_DIR}/frozen_inference_graph.pb
```
Please also set any other model-specific flags that you used for training,
such as `model_variant`, `add_image_level_feature`, etc.
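For reference, below is a minimal sketch (TensorFlow 1.x; the image and graph
file paths are placeholders) of running inference with the exported frozen
graph. The tensor names `ImageTensor:0` and `SemanticPredictions:0` are the
input/output names defined in export_model.py:

```python
import numpy as np
from PIL import Image
import tensorflow as tf

# Load the frozen graph definition exported by export_model.py.
graph_def = tf.GraphDef()
with tf.gfile.GFile('frozen_inference_graph.pb', 'rb') as f:  # placeholder path
  graph_def.ParseFromString(f.read())

graph = tf.Graph()
with graph.as_default():
  tf.import_graph_def(graph_def, name='')

with tf.Session(graph=graph) as sess:
  # 4-D uint8 input of shape [1, height, width, 3], as the model expects.
  image = np.expand_dims(np.asarray(Image.open('image.jpg')), 0)
  predictions = sess.run('SemanticPredictions:0',
                         feed_dict={'ImageTensor:0': image})
  print(predictions.shape)  # [1, height, width], per-pixel class IDs
```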
# FAQ
___
Q1: What if I want to use other network backbones, such as ResNet [1], instead of the provided ones (e.g., Xception)?
A: Users can modify the provided core/feature_extractor.py to support more network backbones.
___
Q2: What if I want to train the model on other datasets?
A: Users can modify the provided dataset/build_{cityscapes,voc2012}_data.py and dataset/segmentation_dataset.py to build their own dataset.
___
Q3: Where can I download the PASCAL VOC augmented training set?
A: The PASCAL VOC augmented training set is provided by Bharath Hariharan et al. [2] Please refer to their [website](http://home.bharathh.info/pubs/codes/SBD/download.html) for details and consider citing their paper if using the dataset.
___
Q4: Why does the implementation not include DenseCRF [3]?
A: We have not tried it. Interested users can take a look at Philipp Krähenbühl's [website](http://graphics.stanford.edu/projects/densecrf/) and [paper](https://arxiv.org/abs/1210.5644) for details.
___
Q5: What if I want to train the model and fine-tune the batch normalization parameters?
A: Fine-tuning batch normalization requires a large batch size, so in train.py we suggest setting `num_clones` (number of GPUs on one machine) and `train_batch_size` as large as possible.
___
Q6: How can I train the model asynchronously?
A: In train.py, users can set `num_replicas` (number of machines for training) and `num_ps_tasks` (we usually set `num_ps_tasks` = `num_replicas` / 2). See slim.deployment.model_deploy for more details.
___
## References
1. **Deep Residual Learning for Image Recognition**<br />
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun<br />
[[link]](https://arxiv.org/abs/1512.03385), In CVPR, 2016.
2. **Semantic Contours from Inverse Detectors**<br />
Bharath Hariharan, Pablo Arbelaez, Lubomir Bourdev, Subhransu Maji, Jitendra Malik<br />
[[link]](http://home.bharathh.info/pubs/codes/SBD/download.html), In ICCV, 2011.
3. **Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials**<br />
Philipp Krähenbühl, Vladlen Koltun<br />
[[link]](http://graphics.stanford.edu/projects/densecrf/), In NIPS, 2011.
Image provenance:
* image1.jpg: Philippe Put, https://www.flickr.com/photos/34547181@N00/14499172124
* image2.jpg: Peretz Partensky, https://www.flickr.com/photos/ifl/3926001309
* image3.jpg: Peter Harrison, https://www.flickr.com/photos/devcentre/392585679
* vis[1-3].png: original images shown together with their DeepLab segmentation maps.
# Installation
## Dependencies
DeepLab depends on the following libraries:
* Numpy
* Pillow 1.0
* tf Slim (which is included in the "tensorflow/models/research/" checkout)
* Jupyter notebook
* Matplotlib
* Tensorflow
For detailed steps to install Tensorflow, follow the [Tensorflow installation
instructions](https://www.tensorflow.org/install/). A typical user can install
Tensorflow using one of the following commands:
```bash
# For CPU
pip install tensorflow
# For GPU
pip install tensorflow-gpu
```
The remaining libraries can be installed on Ubuntu 14.04 via apt-get and pip:
```bash
sudo apt-get install python-pil python-numpy
sudo pip install jupyter
sudo pip install matplotlib
```
## Add Libraries to PYTHONPATH
When running locally, the tensorflow/models/research/ and slim directories
should be appended to PYTHONPATH. This can be done by running the following from
tensorflow/models/research/:
```bash
# From tensorflow/models/research/
export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim
```
Note: This command needs to run from every new terminal you start. If you wish
to avoid running this manually, you can add it as a new line to the end of your
~/.bashrc file.
# Testing the Installation
You can test whether you have successfully installed DeepLab by running the
following commands:
Quick test by running model_test.py:
```bash
# From tensorflow/models/research/
python deeplab/model_test.py
```
Quickly run the whole code on the PASCAL VOC 2012 dataset:
```bash
# From tensorflow/models/research/deeplab
sh local_test.sh
```
# TensorFlow DeepLab Model Zoo
We provide DeepLab models pretrained on the PASCAL VOC 2012 and Cityscapes
datasets for reproducing our results, as well as some checkpoints that are only
pretrained on ImageNet for training your own models.
## DeepLab models trained on PASCAL VOC 2012
Un-tar'ed directory includes:
* a frozen inference graph (`frozen_inference_graph.pb`). All frozen inference
  graphs use an output stride of 8 and a single eval scale of 1.0. No left-right
  flips are used.
* a checkpoint (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`)
### Model details
We provide several checkpoints that have been pretrained on the VOC 2012
train_aug set or the train_aug + trainval set. In the former case, one can train
the model with a smaller batch size and frozen batch normalization when limited
GPU memory is available, since we have already fine-tuned the batch
normalization for you. In the latter case, one can directly evaluate the
checkpoints on the VOC 2012 test set or use them for demo purposes.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
--------------------------- | :--------------: | :-----------------: | :---: | :-----:
xception_coco_voc_trainaug | Xception_65 | MS-COCO <br> VOC 2012 train_aug set| [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
xception_coco_voc_trainval | Xception_65 | MS-COCO <br> VOC 2012 train_aug + trainval sets | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | PASCAL mIOU | File Size
------------------------------------------------------------------------------------------------------------------------ | :-------: | :------------------------: | :-------------: | :------------------: | :------------: | :----------------------------: | :-------:
[xception_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_pascal_train_aug_2018_01_04.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 54.17B <br> 3055.35B | 0.7 <br> 223.2 | 82.20% (val) <br> 83.58% (val) | 439MB
[xception_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_pascal_trainval_2018_01_04.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 3055.35B | 223.2 | 87.80% (**test**) | 439MB
In the table, we report both computation complexity (in terms of Multiply-Adds
and CPU Runtime) and segmentation performance (in terms of mIOU) on the PASCAL
VOC val or test set. The notation [0.5:0.25:1.75] denotes eval scales from 0.5
to 1.75 with a step of 0.25. The reported runtime is calculated by tfprof on a
workstation with a CPU E5-1650 v3 @ 3.50GHz and 32GB memory. Note that applying
multi-scale inputs and left-right flips increases the segmentation performance
but also significantly increases the computation, and thus may not be suitable
for real-time applications.
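For convenience, here is a minimal Python sketch (assuming Python 3; the URL is
the xception_coco_voc_trainaug entry from the table above, and the target
directory name is arbitrary) of downloading and unpacking a checkpoint:

```python
import tarfile
import urllib.request

# URL taken from the model zoo table above.
url = ('http://download.tensorflow.org/models/'
       'deeplabv3_pascal_train_aug_2018_01_04.tar.gz')
filename, _ = urllib.request.urlretrieve(url)
with tarfile.open(filename) as tar:
  # Yields the frozen inference graph and the model.ckpt files.
  tar.extractall('pretrained')
```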
## DeepLab models trained on Cityscapes
### Model details
We provide several checkpoints that have been pretrained on Cityscapes
train_fine set.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
xception_cityscapes_trainfine | Xception_65 | ImageNet <br> Cityscapes train_fine set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | Cityscapes mIOU | File Size
-------------------------------------------------------------------------------------------------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :------------: | :----------------------------: | :-------:
[xception_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
## Checkpoints pretrained on ImageNet
Un-tar'ed directory includes:
* model checkpoint (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`).
### Model details
We also provide some checkpoints that are only pretrained on ImageNet, so that
one can use them as a starting point for training one's own models.
* xception: We adapt the original Xception model to the task of semantic
segmentation with the following changes: (1) more layers, (2) all max
pooling operations are replaced by strided (atrous) separable convolutions,
and (3) extra batch-norm and ReLU after each 3x3 depthwise convolution are
added.
Model name | File Size
-------------------------------------------------------------------------------------- | :-------:
[xception](http://download.tensorflow.org/models/deeplabv3_xception_2018_01_04.tar.gz) | 447MB
## References
1. **Mobilenets: Efficient convolutional neural networks for mobile vision applications**<br />
Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam<br />
[[link]](https://arxiv.org/abs/1704.04861). arXiv:1704.04861, 2017.
2. **Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation**<br />
Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen<br />
[[link]](https://arxiv.org/abs/1801.04381). arXiv:1801.04381, 2018.
3. **Xception: Deep Learning with Depthwise Separable Convolutions**<br />
François Chollet<br />
[[link]](https://arxiv.org/abs/1610.02357). In the Proc. of CVPR, 2017.
4. **Deformable Convolutional Networks -- COCO Detection and Segmentation Challenge 2017 Entry**<br />
Haozhi Qi, Zheng Zhang, Bin Xiao, Han Hu, Bowen Cheng, Yichen Wei, Jifeng Dai<br />
[[link]](http://presentations.cocodataset.org/COCO17-Detect-MSRA.pdf). ICCV COCO Challenge
Workshop, 2017.
5. **The Pascal Visual Object Classes Challenge: A Retrospective**<br />
Mark Everingham, S. M. Ali Eslami, Luc Van Gool, Christopher K. I. Williams, John M. Winn, Andrew Zisserman<br />
[[link]](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/). IJCV, 2014.
6. **Semantic Contours from Inverse Detectors**<br />
Bharath Hariharan, Pablo Arbelaez, Lubomir Bourdev, Subhransu Maji, Jitendra Malik<br />
[[link]](http://home.bharathh.info/pubs/codes/SBD/download.html). In the Proc. of ICCV, 2011.
7. **The Cityscapes Dataset for Semantic Urban Scene Understanding**<br />
Cordts, Marius, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, Bernt Schiele. <br />
[[link]](https://www.cityscapes-dataset.com/). In the Proc. of CVPR, 2016.
8. **Microsoft COCO: Common Objects in Context**<br />
Tsung-Yi Lin, Michael Maire, Serge Belongie, Lubomir Bourdev, Ross Girshick, James Hays, Pietro Perona, Deva Ramanan, C. Lawrence Zitnick, Piotr Dollar<br />
[[link]](http://cocodataset.org/). In the Proc. of ECCV, 2014.
9. **ImageNet Large Scale Visual Recognition Challenge**<br />
Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein, Alexander C. Berg, Li Fei-Fei<br />
[[link]](http://www.image-net.org/). IJCV, 2015.
# Running DeepLab on PASCAL VOC 2012 Semantic Segmentation Dataset
This page walks through the steps required to run DeepLab on PASCAL VOC 2012 on
a local machine.
## Download dataset and convert to TFRecord
We have prepared the script (under the folder `datasets`) to download and
convert the PASCAL VOC 2012 semantic segmentation dataset to TFRecord.
```bash
# From the tensorflow/models/research/deeplab/datasets directory.
sh download_and_convert_voc2012.sh
```
The converted dataset will be saved at
`./deeplab/datasets/pascal_voc_seg/tfrecord`.
## Recommended Directory Structure for Training and Evaluation
```
+ datasets
+ pascal_voc_seg
+ VOCdevkit
+ VOC2012
+ JPEGImages
+ SegmentationClass
+ tfrecord
+ exp
+ train_on_train_set
+ train
+ eval
+ vis
```
where the folder `train_on_train_set` stores the train/eval/vis events and
results (when training DeepLab on the PASCAL VOC 2012 train set).
## Running the train/eval/vis jobs
A local training job using `xception_65` can be run with the following command:
```bash
# From tensorflow/models/research/
python deeplab/train.py \
--logtostderr \
--train_split="train" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_batch_size=1 \
--tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
--train_logdir=${PATH_TO_TRAIN_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_INITIAL_CHECKPOINT} is the path to the initial checkpoint
(usually an ImageNet pretrained checkpoint), ${PATH_TO_TRAIN_DIR} is the
directory to which training checkpoints and events will be written, and
${PATH_TO_DATASET} is the directory in which the PASCAL VOC 2012 dataset
resides.
Note that for {train,eval,vis}.py:
1. We use a small batch size during training. Users can change it based on the
   available GPU memory, and set `fine_tune_batch_norm` to True or False
   depending on the use case.
2. Change `atrous_rates` from [6, 12, 18] to [12, 24, 36] if setting
   `output_stride=8`.
3. Omit the flag `decoder_output_stride` if you do not want to use the decoder
   structure.
A local evaluation job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=513 \
--eval_crop_size=513 \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--eval_logdir=${PATH_TO_EVAL_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_EVAL_DIR} is the directory to which evaluation
events will be written, and ${PATH_TO_DATASET} is the directory in which the
PASCAL VOC 2012 dataset resides.
A local visualization job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=513 \
--vis_crop_size=513 \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--vis_logdir=${PATH_TO_VIS_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_VIS_DIR} is the directory to which
visualization results will be written, and ${PATH_TO_DATASET} is the directory
in which the PASCAL VOC 2012 dataset resides. Note that if you would like to
save the segmentation results for the evaluation server, set
`also_save_raw_predictions=True`.
## Running Tensorboard
Progress for training and evaluation jobs can be inspected using Tensorboard. If
using the recommended directory structure, Tensorboard can be run using the
following command:
```bash
tensorboard --logdir=${PATH_TO_LOG_DIRECTORY}
```
where `${PATH_TO_LOG_DIRECTORY}` points to the directory that contains the
train, eval, and vis directories (e.g., the folder `train_on_train_set` in the
above example). Please note it may take Tensorboard a couple of minutes to
populate with data.
## Example
We provide a script to run the {train,eval,vis,export_model}.py on the PASCAL VOC
2012 dataset as an example. See the code in local_test.sh for details.
```bash
# From tensorflow/models/research/deeplab
sh local_test.sh
```
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prepares the data used for DeepLab training/evaluation."""
import tensorflow as tf
from deeplab.core import feature_extractor
from deeplab.core import preprocess_utils
# The probability of flipping the images and labels
# left-right during training
_PROB_OF_FLIP = 0.5
def preprocess_image_and_label(image,
label,
crop_height,
crop_width,
min_resize_value=None,
max_resize_value=None,
resize_factor=None,
min_scale_factor=1.,
max_scale_factor=1.,
scale_factor_step_size=0,
ignore_label=255,
is_training=True,
model_variant=None):
"""Preprocesses the image and label.
Args:
image: Input image.
label: Ground truth annotation label.
crop_height: The height value used to crop the image and label.
crop_width: The width value used to crop the image and label.
min_resize_value: Desired size of the smaller image side.
max_resize_value: Maximum allowed size of the larger image side.
resize_factor: Resized dimensions are multiple of factor plus one.
min_scale_factor: Minimum scale factor value.
max_scale_factor: Maximum scale factor value.
scale_factor_step_size: The step size from min scale factor to max scale
factor. The input is randomly scaled based on the value of
(min_scale_factor, max_scale_factor, scale_factor_step_size).
ignore_label: The label value which will be ignored for training and
evaluation.
is_training: If the preprocessing is used for training or not.
model_variant: Model variant (string) for choosing how to mean-subtract the
images. See feature_extractor.network_map for supported model variants.
Returns:
original_image: Original image (could be resized).
processed_image: Preprocessed image.
label: Preprocessed ground truth segmentation label.
Raises:
ValueError: Ground truth label not provided during training.
"""
if is_training and label is None:
raise ValueError('During training, label must be provided.')
if model_variant is None:
tf.logging.warning('Default mean-subtraction is performed. Please specify '
'a model_variant. See feature_extractor.network_map for '
'supported model variants.')
# Keep reference to original image.
original_image = image
processed_image = tf.cast(image, tf.float32)
if label is not None:
label = tf.cast(label, tf.int32)
# Resize image and label to the desired range.
if min_resize_value is not None or max_resize_value is not None:
[processed_image, label] = (
preprocess_utils.resize_to_range(
image=processed_image,
label=label,
min_size=min_resize_value,
max_size=max_resize_value,
factor=resize_factor,
align_corners=True))
# The `original_image` becomes the resized image.
original_image = tf.identity(processed_image)
# Data augmentation by randomly scaling the inputs.
scale = preprocess_utils.get_random_scale(
min_scale_factor, max_scale_factor, scale_factor_step_size)
processed_image, label = preprocess_utils.randomly_scale_image_and_label(
processed_image, label, scale)
processed_image.set_shape([None, None, 3])
# Pad image and label to have dimensions >= [crop_height, crop_width]
image_shape = tf.shape(processed_image)
image_height = image_shape[0]
image_width = image_shape[1]
target_height = image_height + tf.maximum(crop_height - image_height, 0)
target_width = image_width + tf.maximum(crop_width - image_width, 0)
# Pad image with mean pixel value.
mean_pixel = tf.reshape(
feature_extractor.mean_pixel(model_variant), [1, 1, 3])
processed_image = preprocess_utils.pad_to_bounding_box(
processed_image, 0, 0, target_height, target_width, mean_pixel)
if label is not None:
label = preprocess_utils.pad_to_bounding_box(
label, 0, 0, target_height, target_width, ignore_label)
# Randomly crop the image and label.
if is_training and label is not None:
processed_image, label = preprocess_utils.random_crop(
[processed_image, label], crop_height, crop_width)
processed_image.set_shape([crop_height, crop_width, 3])
if label is not None:
label.set_shape([crop_height, crop_width, 1])
if is_training:
# Randomly left-right flip the image and label.
processed_image, label, _ = preprocess_utils.flip_dim(
[processed_image, label], _PROB_OF_FLIP, dim=1)
return original_image, processed_image, label
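# Example (evaluation-time usage, as in export_model.py): calling
# preprocess_image_and_label(image, label=None, crop_height=513,
# crop_width=513, is_training=False, model_variant='xception_65') returns the
# (possibly resized) original image, the padded float32 image, and None for
# the label.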
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# This script is used to run a local test on PASCAL VOC 2012. Users could also
# modify this script for their own use case.
#
# Usage:
# # From the tensorflow/models/research/deeplab directory.
# sh ./local_test.sh
#
#
# Exit immediately if a command exits with a non-zero status.
set -e
# Move one-level up to tensorflow/models/research directory.
cd ..
# Update PYTHONPATH.
export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim
# Set up the working environment.
CURRENT_DIR=$(pwd)
WORK_DIR="${CURRENT_DIR}/deeplab"
# Run model_test first to make sure the PYTHONPATH is correctly set.
python "${WORK_DIR}"/model_test.py -v
# Go to datasets folder and download PASCAL VOC 2012 segmentation dataset.
DATASET_DIR="datasets"
cd "${WORK_DIR}/${DATASET_DIR}"
sh download_and_convert_voc2012.sh
# Go back to original directory.
cd "${CURRENT_DIR}"
# Set up the working directories.
PASCAL_FOLDER="pascal_voc_seg"
EXP_FOLDER="exp/train_on_trainval_set"
INIT_FOLDER="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/init_models"
TRAIN_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/train"
EVAL_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/eval"
VIS_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/vis"
EXPORT_DIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/export"
mkdir -p "${INIT_FOLDER}"
mkdir -p "${TRAIN_LOGDIR}"
mkdir -p "${EVAL_LOGDIR}"
mkdir -p "${VIS_LOGDIR}"
mkdir -p "${EXPORT_DIR}"
# Copy the trained checkpoint locally to use as the initial checkpoint.
TF_INIT_ROOT="http://download.tensorflow.org/models"
TF_INIT_CKPT="deeplabv3_pascal_train_aug_2018_01_04.tar.gz"
cd "${INIT_FOLDER}"
wget -nd -c "${TF_INIT_ROOT}/${TF_INIT_CKPT}"
tar -xf "${TF_INIT_CKPT}"
cd "${CURRENT_DIR}"
PASCAL_DATASET="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/tfrecord"
# Train 10 iterations.
NUM_ITERATIONS=10
python "${WORK_DIR}"/train.py \
--logtostderr \
--train_split="trainval" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_batch_size=4 \
--training_number_of_steps="${NUM_ITERATIONS}" \
--fine_tune_batch_norm=true \
--tf_initial_checkpoint="${INIT_FOLDER}/deeplabv3_pascal_train_aug/model.ckpt" \
--train_logdir="${TRAIN_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}"
# Run evaluation. This performs eval over the full val split (1449 images) and
# will take a while.
# Using the provided checkpoint, one should expect mIOU=82.20%.
python "${WORK_DIR}"/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=513 \
--eval_crop_size=513 \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--eval_logdir="${EVAL_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
--max_number_of_evaluations=1
# Visualize the results.
python "${WORK_DIR}"/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=513 \
--vis_crop_size=513 \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--vis_logdir="${VIS_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
--max_number_of_iterations=1
# Export the trained checkpoint.
CKPT_PATH="${TRAIN_LOGDIR}/model.ckpt-${NUM_ITERATIONS}"
EXPORT_PATH="${EXPORT_DIR}/frozen_inference_graph.pb"
python "${WORK_DIR}"/export_model.py \
--logtostderr \
--checkpoint_path="${CKPT_PATH}" \
--export_path="${EXPORT_PATH}" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--num_classes=21 \
--crop_size=513 \
--crop_size=513 \
--inference_scales=1.0
# Run inference with the exported checkpoint.
# Please refer to the provided deeplab_demo.ipynb for an example.
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Provides DeepLab model definition and helper functions.
DeepLab is a deep learning system for semantic image segmentation with
the following features:
(1) Atrous convolution to explicitly control the resolution at which
feature responses are computed within Deep Convolutional Neural Networks.
(2) Atrous spatial pyramid pooling (ASPP) to robustly segment objects at
multiple scales with filters at multiple sampling rates and effective
fields-of-views.
(3) ASPP module augmented with image-level feature and batch normalization.
(4) A simple yet effective decoder module to recover the object boundaries.
See the following papers for more details:
"Encoder-Decoder with Atrous Separable Convolution for Semantic Image
Segmentation"
Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam.
(https://arxiv.org/abs/1802.02611)
"Rethinking Atrous Convolution for Semantic Image Segmentation,"
Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam
(https://arxiv.org/abs/1706.05587)
"DeepLab: Semantic Image Segmentation with Deep Convolutional Nets,
Atrous Convolution, and Fully Connected CRFs",
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy,
Alan L Yuille (* equal contribution)
(https://arxiv.org/abs/1606.00915)
"Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected
CRFs"
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy,
Alan L. Yuille (* equal contribution)
(https://arxiv.org/abs/1412.7062)
"""
import tensorflow as tf
from deeplab.core import feature_extractor
slim = tf.contrib.slim
_LOGITS_SCOPE_NAME = 'logits'
_MERGED_LOGITS_SCOPE = 'merged_logits'
_IMAGE_POOLING_SCOPE = 'image_pooling'
_ASPP_SCOPE = 'aspp'
_CONCAT_PROJECTION_SCOPE = 'concat_projection'
_DECODER_SCOPE = 'decoder'
def get_extra_layer_scopes():
"""Gets the scopes for extra layers.
Returns:
A list of scopes for extra layers.
"""
return [
_LOGITS_SCOPE_NAME,
_IMAGE_POOLING_SCOPE,
_ASPP_SCOPE,
_CONCAT_PROJECTION_SCOPE,
_DECODER_SCOPE,
]
def predict_labels_multi_scale(images,
model_options,
eval_scales=(1.0,),
add_flipped_images=False):
"""Predicts segmentation labels.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
eval_scales: The scales to resize images for evaluation.
add_flipped_images: Add flipped images for evaluation or not.
Returns:
A dictionary with keys specifying the output_type (e.g., semantic
prediction) and values storing Tensors representing predictions (argmax
over channels). Each prediction has size [batch, height, width].
"""
outputs_to_predictions = {
output: []
for output in model_options.outputs_to_num_classes
}
for i, image_scale in enumerate(eval_scales):
with tf.variable_scope(tf.get_variable_scope(), reuse=True if i else None):
outputs_to_scales_to_logits = multi_scale_logits(
images,
model_options=model_options,
image_pyramid=[image_scale],
is_training=False,
fine_tune_batch_norm=False)
if add_flipped_images:
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
outputs_to_scales_to_logits_reversed = multi_scale_logits(
tf.reverse_v2(images, [2]),
model_options=model_options,
image_pyramid=[image_scale],
is_training=False,
fine_tune_batch_norm=False)
for output in sorted(outputs_to_scales_to_logits):
scales_to_logits = outputs_to_scales_to_logits[output]
logits = tf.image.resize_bilinear(
scales_to_logits[_MERGED_LOGITS_SCOPE],
tf.shape(images)[1:3],
align_corners=True)
outputs_to_predictions[output].append(
tf.expand_dims(tf.nn.softmax(logits), 4))
if add_flipped_images:
scales_to_logits_reversed = (
outputs_to_scales_to_logits_reversed[output])
logits_reversed = tf.image.resize_bilinear(
tf.reverse_v2(scales_to_logits_reversed[_MERGED_LOGITS_SCOPE], [2]),
tf.shape(images)[1:3],
align_corners=True)
outputs_to_predictions[output].append(
tf.expand_dims(tf.nn.softmax(logits_reversed), 4))
for output in sorted(outputs_to_predictions):
predictions = outputs_to_predictions[output]
# Compute average prediction across different scales and flipped images.
predictions = tf.reduce_mean(tf.concat(predictions, 4), axis=4)
outputs_to_predictions[output] = tf.argmax(predictions, 3)
return outputs_to_predictions
def predict_labels(images, model_options, image_pyramid=None):
"""Predicts segmentation labels.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
image_pyramid: Input image scales for multi-scale feature extraction.
Returns:
A dictionary with keys specifying the output_type (e.g., semantic
prediction) and values storing Tensors representing predictions (argmax
over channels). Each prediction has size [batch, height, width].
"""
outputs_to_scales_to_logits = multi_scale_logits(
images,
model_options=model_options,
image_pyramid=image_pyramid,
is_training=False,
fine_tune_batch_norm=False)
predictions = {}
for output in sorted(outputs_to_scales_to_logits):
scales_to_logits = outputs_to_scales_to_logits[output]
logits = tf.image.resize_bilinear(
scales_to_logits[_MERGED_LOGITS_SCOPE],
tf.shape(images)[1:3],
align_corners=True)
predictions[output] = tf.argmax(logits, 3)
return predictions
def scale_dimension(dim, scale):
"""Scales the input dimension.
Args:
dim: Input dimension (a scalar or a scalar Tensor).
scale: The amount of scaling applied to the input.
Returns:
Scaled dimension.
"""
if isinstance(dim, tf.Tensor):
return tf.cast((tf.to_float(dim) - 1.0) * scale + 1.0, dtype=tf.int32)
else:
return int((float(dim) - 1.0) * scale + 1.0)
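# For example, scale_dimension(321, 0.5) == int((321 - 1) * 0.5 + 1) == 161,
# following the (dim - 1) * scale + 1 convention that matches the
# align_corners resizing used throughout this file.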
def multi_scale_logits(images,
model_options,
image_pyramid,
weight_decay=0.0001,
is_training=False,
fine_tune_batch_norm=False):
"""Gets the logits for multi-scale inputs.
The returned logits are all downsampled (due to max-pooling layers)
for both training and evaluation.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
image_pyramid: Input image scales for multi-scale feature extraction.
weight_decay: The weight decay for model variables.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
outputs_to_scales_to_logits: A map of maps from output_type (e.g.,
semantic prediction) to a dictionary of multi-scale logits names to
logits. For each output_type, the dictionary has keys which
correspond to the scales and values which correspond to the logits.
For example, if `scales` equals [1.0, 1.5], then the keys would
include 'merged_logits', 'logits_1.00' and 'logits_1.50'.
Raises:
ValueError: If model_options doesn't specify crop_size and its
add_image_level_feature = True, since add_image_level_feature requires
crop_size information. Or, if model_options has model_variant =
'mobilenet_v2' but atrous_rates or decoder_output_stride are not None.
"""
# Setup default values.
if not image_pyramid:
image_pyramid = [1.0]
if model_options.crop_size is None and model_options.add_image_level_feature:
raise ValueError(
'Crop size must be specified for using image-level feature.')
crop_height = (
model_options.crop_size[0]
if model_options.crop_size else tf.shape(images)[1])
crop_width = (
model_options.crop_size[1]
if model_options.crop_size else tf.shape(images)[2])
# Compute the height, width for the output logits.
logits_output_stride = (
model_options.decoder_output_stride or model_options.output_stride)
logits_height = scale_dimension(
crop_height,
max(1.0, max(image_pyramid)) / logits_output_stride)
logits_width = scale_dimension(
crop_width,
max(1.0, max(image_pyramid)) / logits_output_stride)
# Compute the logits for each scale in the image pyramid.
outputs_to_scales_to_logits = {
k: {}
for k in model_options.outputs_to_num_classes
}
for count, image_scale in enumerate(image_pyramid):
if image_scale != 1.0:
scaled_height = scale_dimension(crop_height, image_scale)
scaled_width = scale_dimension(crop_width, image_scale)
scaled_crop_size = [scaled_height, scaled_width]
scaled_images = tf.image.resize_bilinear(
images, scaled_crop_size, align_corners=True)
if model_options.crop_size:
scaled_images.set_shape([None, scaled_height, scaled_width, 3])
else:
scaled_crop_size = model_options.crop_size
scaled_images = images
updated_options = model_options._replace(crop_size=scaled_crop_size)
outputs_to_logits = _get_logits(
scaled_images,
updated_options,
weight_decay=weight_decay,
reuse=True if count else None,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
# Resize the logits to have the same dimension before merging.
for output in sorted(outputs_to_logits):
outputs_to_logits[output] = tf.image.resize_bilinear(
outputs_to_logits[output], [logits_height, logits_width],
align_corners=True)
# Return when only one input scale.
if len(image_pyramid) == 1:
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_scales_to_logits[output][
_MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
return outputs_to_scales_to_logits
# Save logits to the output map.
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_scales_to_logits[output][
'logits_%.2f' % image_scale] = outputs_to_logits[output]
# Merge the logits from all the multi-scale inputs.
for output in sorted(model_options.outputs_to_num_classes):
# Concatenate the multi-scale logits for each output type.
all_logits = [
tf.expand_dims(logits, axis=4)
for logits in outputs_to_scales_to_logits[output].values()
]
all_logits = tf.concat(all_logits, 4)
merge_fn = (
tf.reduce_max
if model_options.merge_method == 'max' else tf.reduce_mean)
outputs_to_scales_to_logits[output][_MERGED_LOGITS_SCOPE] = merge_fn(
all_logits, axis=4)
return outputs_to_scales_to_logits
def _extract_features(images,
model_options,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
"""Extracts features by the particular model_variant.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
weight_decay: The weight decay for model variables.
reuse: Reuse the model variables or not.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
concat_logits: A tensor of size [batch, feature_height, feature_width,
feature_channels], where feature_height/feature_width are determined by
the images height/width and output_stride.
end_points: A dictionary from components of the network to the corresponding
activation.
"""
features, end_points = feature_extractor.extract_features(
images,
output_stride=model_options.output_stride,
multi_grid=model_options.multi_grid,
model_variant=model_options.model_variant,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
if not model_options.aspp_with_batch_norm:
return features, end_points
else:
batch_norm_params = {
'is_training': is_training and fine_tune_batch_norm,
'decay': 0.9997,
'epsilon': 1e-5,
'scale': True,
}
with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
padding='SAME',
stride=1,
reuse=reuse):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
depth = 256
branch_logits = []
if model_options.add_image_level_feature:
pool_height = scale_dimension(model_options.crop_size[0],
1. / model_options.output_stride)
pool_width = scale_dimension(model_options.crop_size[1],
1. / model_options.output_stride)
image_feature = slim.avg_pool2d(
features, [pool_height, pool_width], [pool_height, pool_width],
padding='VALID')
image_feature = slim.conv2d(
image_feature, depth, 1, scope=_IMAGE_POOLING_SCOPE)
image_feature = tf.image.resize_bilinear(
image_feature, [pool_height, pool_width], align_corners=True)
image_feature.set_shape([None, pool_height, pool_width, depth])
branch_logits.append(image_feature)
# Employ a 1x1 convolution.
branch_logits.append(slim.conv2d(features, depth, 1,
scope=_ASPP_SCOPE + str(0)))
if model_options.atrous_rates:
# Employ 3x3 convolutions with different atrous rates.
for i, rate in enumerate(model_options.atrous_rates, 1):
scope = _ASPP_SCOPE + str(i)
if model_options.aspp_with_separable_conv:
aspp_features = _split_separable_conv2d(
features,
filters=depth,
rate=rate,
weight_decay=weight_decay,
scope=scope)
else:
aspp_features = slim.conv2d(
features, depth, 3, rate=rate, scope=scope)
branch_logits.append(aspp_features)
# Merge branch logits.
concat_logits = tf.concat(branch_logits, 3)
concat_logits = slim.conv2d(
concat_logits, depth, 1, scope=_CONCAT_PROJECTION_SCOPE)
concat_logits = slim.dropout(
concat_logits,
keep_prob=0.9,
is_training=is_training,
scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
return concat_logits, end_points
def _get_logits(images,
model_options,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
"""Gets the logits by atrous/image spatial pyramid pooling.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
weight_decay: The weight decay for model variables.
reuse: Reuse the model variables or not.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
outputs_to_logits: A map from output_type to logits.
"""
features, end_points = _extract_features(
images,
model_options,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
if model_options.decoder_output_stride is not None:
decoder_height = scale_dimension(model_options.crop_size[0],
1.0 / model_options.decoder_output_stride)
decoder_width = scale_dimension(model_options.crop_size[1],
1.0 / model_options.decoder_output_stride)
features = refine_by_decoder(
features,
end_points,
decoder_height=decoder_height,
decoder_width=decoder_width,
decoder_use_separable_conv=model_options.decoder_use_separable_conv,
model_variant=model_options.model_variant,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
outputs_to_logits = {}
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_logits[output] = _get_branch_logits(
features,
model_options.outputs_to_num_classes[output],
model_options.atrous_rates,
aspp_with_batch_norm=model_options.aspp_with_batch_norm,
kernel_size=model_options.logits_kernel_size,
weight_decay=weight_decay,
reuse=reuse,
scope_suffix=output)
return outputs_to_logits
def refine_by_decoder(features,
end_points,
decoder_height,
decoder_width,
decoder_use_separable_conv=False,
model_variant=None,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
"""Adds the decoder to obtain sharper segmentation results.
Args:
features: A tensor of size [batch, features_height, features_width,
features_channels].
end_points: A dictionary from components of the network to the corresponding
activation.
decoder_height: The height of decoder feature maps.
decoder_width: The width of decoder feature maps.
decoder_use_separable_conv: Employ separable convolution for decoder or not.
model_variant: Model variant for feature extraction.
weight_decay: The weight decay for model variables.
reuse: Reuse the model variables or not.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
Decoder output with size [batch, decoder_height, decoder_width,
decoder_channels].
"""
batch_norm_params = {
'is_training': is_training and fine_tune_batch_norm,
'decay': 0.9997,
'epsilon': 1e-5,
'scale': True,
}
with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
padding='SAME',
stride=1,
reuse=reuse):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
with tf.variable_scope(_DECODER_SCOPE, _DECODER_SCOPE, [features]):
feature_list = feature_extractor.networks_to_feature_maps[
model_variant][feature_extractor.DECODER_END_POINTS]
if feature_list is None:
          tf.logging.info('No decoder end points found.')
return features
else:
decoder_features = features
for i, name in enumerate(feature_list):
decoder_features_list = [decoder_features]
feature_name = '{}/{}'.format(
feature_extractor.name_scope[model_variant], name)
decoder_features_list.append(
slim.conv2d(
end_points[feature_name],
48,
1,
scope='feature_projection' + str(i)))
# Resize to decoder_height/decoder_width.
for j, feature in enumerate(decoder_features_list):
decoder_features_list[j] = tf.image.resize_bilinear(
feature, [decoder_height, decoder_width], align_corners=True)
decoder_features_list[j].set_shape(
[None, decoder_height, decoder_width, None])
decoder_depth = 256
if decoder_use_separable_conv:
decoder_features = _split_separable_conv2d(
tf.concat(decoder_features_list, 3),
filters=decoder_depth,
rate=1,
weight_decay=weight_decay,
scope='decoder_conv0')
decoder_features = _split_separable_conv2d(
decoder_features,
filters=decoder_depth,
rate=1,
weight_decay=weight_decay,
scope='decoder_conv1')
else:
num_convs = 2
decoder_features = slim.repeat(
tf.concat(decoder_features_list, 3),
num_convs,
slim.conv2d,
decoder_depth,
3,
scope='decoder_conv' + str(i))
return decoder_features
def _get_branch_logits(features,
num_classes,
atrous_rates=None,
aspp_with_batch_norm=False,
kernel_size=1,
weight_decay=0.0001,
reuse=None,
scope_suffix=''):
"""Gets the logits from each model's branch.
The underlying model is branched out in the last layer when atrous
spatial pyramid pooling is employed, and all branches are sum-merged
to form the final logits.
Args:
features: A float tensor of shape [batch, height, width, channels].
num_classes: Number of classes to predict.
atrous_rates: A list of atrous convolution rates for last layer.
aspp_with_batch_norm: Use batch normalization layers for ASPP.
kernel_size: Kernel size for convolution.
weight_decay: Weight decay for the model variables.
reuse: Reuse model variables or not.
scope_suffix: Scope suffix for the model variables.
Returns:
Merged logits with shape [batch, height, width, num_classes].
Raises:
ValueError: Upon invalid input kernel_size value.
"""
# When using batch normalization with ASPP, ASPP has been applied before
# in _extract_features, and thus we simply apply 1x1 convolution here.
if aspp_with_batch_norm or atrous_rates is None:
if kernel_size != 1:
raise ValueError('Kernel size must be 1 when atrous_rates is None or '
                       'using aspp_with_batch_norm. Got %d.' % kernel_size)
atrous_rates = [1]
with slim.arg_scope(
[slim.conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
reuse=reuse):
with tf.variable_scope(_LOGITS_SCOPE_NAME, _LOGITS_SCOPE_NAME, [features]):
branch_logits = []
for i, rate in enumerate(atrous_rates):
scope = scope_suffix
if i:
scope += '_%d' % i
branch_logits.append(
slim.conv2d(
features,
num_classes,
kernel_size=kernel_size,
rate=rate,
activation_fn=None,
normalizer_fn=None,
scope=scope))
return tf.add_n(branch_logits)
def _split_separable_conv2d(inputs,
filters,
rate=1,
weight_decay=0.00004,
depthwise_weights_initializer_stddev=0.33,
pointwise_weights_initializer_stddev=0.06,
scope=None):
"""Splits a separable conv2d into depthwise and pointwise conv2d.
This operation differs from `tf.layers.separable_conv2d` as this operation
applies activation function between depthwise and pointwise conv2d.
Args:
inputs: Input tensor with shape [batch, height, width, channels].
filters: Number of filters in the 1x1 pointwise convolution.
rate: Atrous convolution rate for the depthwise convolution.
weight_decay: The weight decay to use for regularizing the model.
depthwise_weights_initializer_stddev: The standard deviation of the
truncated normal weight initializer for depthwise convolution.
pointwise_weights_initializer_stddev: The standard deviation of the
truncated normal weight initializer for pointwise convolution.
scope: Optional scope for the operation.
Returns:
Computed features after split separable conv2d.
"""
outputs = slim.separable_conv2d(
inputs,
None,
3,
depth_multiplier=1,
rate=rate,
weights_initializer=tf.truncated_normal_initializer(
stddev=depthwise_weights_initializer_stddev),
weights_regularizer=None,
scope=scope + '_depthwise')
return slim.conv2d(
outputs,
filters,
1,
weights_initializer=tf.truncated_normal_initializer(
stddev=pointwise_weights_initializer_stddev),
weights_regularizer=slim.l2_regularizer(weight_decay),
scope=scope + '_pointwise')
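
# Worked example (a sketch, not part of the original file): for a 3x3
# separable conv mapping 256 channels to 256 filters, the depthwise pass costs
# 3*3*256 = 2,304 weights and the pointwise pass 256*256 = 65,536 weights,
# versus 3*3*256*256 = 589,824 for a dense 3x3 conv, roughly an 8.7x parameter
# reduction.
#
#   features = tf.random_uniform((1, 65, 65, 256))
#   refined = _split_separable_conv2d(features, filters=256, rate=2,
#                                     scope='example')
#   # refined has shape [1, 65, 65, 256].
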
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for DeepLab model and some helper functions."""
import tensorflow as tf
from deeplab import common
from deeplab import model
class DeeplabModelTest(tf.test.TestCase):
def testScaleDimensionOutput(self):
self.assertEqual(161, model.scale_dimension(321, 0.5))
self.assertEqual(193, model.scale_dimension(321, 0.6))
self.assertEqual(241, model.scale_dimension(321, 0.75))
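    # The expected values follow the align_corners-style formula, assuming
    # scale_dimension computes int((dim - 1) * scale + 1.0):
    #   (321 - 1) * 0.5  + 1 = 161
    #   (321 - 1) * 0.6  + 1 = 193
    #   (321 - 1) * 0.75 + 1 = 241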

  def testWrongDeepLabVariant(self):
model_options = common.ModelOptions([])._replace(
model_variant='no_such_variant')
with self.assertRaises(ValueError):
model._get_logits(images=[], model_options=model_options)

  def testBuildDeepLabv2(self):
batch_size = 2
crop_size = [41, 41]
# Test with two image_pyramids.
image_pyramids = [[1], [0.5, 1]]
    # Model variants to test.
    model_variants = ['xception_65']
# Test with two output_types.
outputs_to_num_classes = {'semantic': 3,
'direction': 2}
expected_endpoints = [['merged_logits'],
['merged_logits',
'logits_0.50',
'logits_1.00']]
expected_num_logits = [1, 3]
for model_variant in model_variants:
model_options = common.ModelOptions(outputs_to_num_classes)._replace(
add_image_level_feature=False,
aspp_with_batch_norm=False,
aspp_with_separable_conv=False,
model_variant=model_variant)
for i, image_pyramid in enumerate(image_pyramids):
g = tf.Graph()
with g.as_default():
with self.test_session(graph=g):
inputs = tf.random_uniform(
(batch_size, crop_size[0], crop_size[1], 3))
outputs_to_scales_to_logits = model.multi_scale_logits(
inputs, model_options, image_pyramid=image_pyramid)
# Check computed results for each output type.
for output in outputs_to_num_classes:
scales_to_logits = outputs_to_scales_to_logits[output]
self.assertListEqual(sorted(scales_to_logits.keys()),
sorted(expected_endpoints[i]))
              # Expected number of logits = len(image_pyramid) + 1, since the
              # last logits are merged from all the scales.
              self.assertEqual(len(scales_to_logits), expected_num_logits[i])

  def testForwardpassDeepLabv3plus(self):
crop_size = [33, 33]
outputs_to_num_classes = {'semantic': 3}
model_options = common.ModelOptions(
outputs_to_num_classes,
crop_size,
atrous_rates=[6],
output_stride=16
)._replace(
add_image_level_feature=True,
aspp_with_batch_norm=True,
aspp_with_separable_conv=True,
decoder_output_stride=4,
decoder_use_separable_conv=True,
logits_kernel_size=1,
model_variant='xception_65')
g = tf.Graph()
with g.as_default():
with self.test_session(graph=g) as sess:
inputs = tf.random_uniform(
(1, crop_size[0], crop_size[1], 3))
outputs_to_scales_to_logits = model.multi_scale_logits(
inputs,
model_options,
image_pyramid=[1.0])
sess.run(tf.global_variables_initializer())
outputs_to_scales_to_logits = sess.run(outputs_to_scales_to_logits)
# Check computed results for each output type.
for output in outputs_to_num_classes:
scales_to_logits = outputs_to_scales_to_logits[output]
# Expect only one output.
          self.assertEqual(len(scales_to_logits), 1)
for logits in scales_to_logits.values():
self.assertTrue(logits.any())


if __name__ == '__main__':
tf.test.main()
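
# To run these tests (assuming the repository root is on PYTHONPATH):
#   python deeplab/model_test.py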
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training script for the DeepLab model.
See model.py for more details and usage.
"""
import tensorflow as tf
from deeplab import common
from deeplab import model
from deeplab.datasets import segmentation_dataset
from deeplab.utils import input_generator
from deeplab.utils import train_utils
from deployment import model_deploy
slim = tf.contrib.slim
prefetch_queue = slim.prefetch_queue
flags = tf.app.flags
FLAGS = flags.FLAGS
# Settings for multi-GPUs/multi-replicas training.
flags.DEFINE_integer('num_clones', 1, 'Number of clones to deploy.')
flags.DEFINE_boolean('clone_on_cpu', False, 'Use CPUs to deploy clones.')
flags.DEFINE_integer('num_replicas', 1, 'Number of worker replicas.')
flags.DEFINE_integer('startup_delay_steps', 15,
'Number of training steps between replicas startup.')
flags.DEFINE_integer('num_ps_tasks', 0,
'The number of parameter servers. If the value is 0, then '
'the parameters are handled locally by the worker.')
flags.DEFINE_string('master', '', 'BNS name of the tensorflow server')
flags.DEFINE_integer('task', 0, 'The task ID.')
# Settings for logging.
flags.DEFINE_string('train_logdir', None,
'Where the checkpoint and logs are stored.')
flags.DEFINE_integer('log_steps', 10,
'Display logging information at every log_steps.')
flags.DEFINE_integer('save_interval_secs', 1200,
'How often, in seconds, we save the model to disk.')
flags.DEFINE_integer('save_summaries_secs', 600,
'How often, in seconds, we compute the summaries.')
# Settings for training strategy.
flags.DEFINE_enum('learning_policy', 'poly', ['poly', 'step'],
'Learning rate policy for training.')
# Use 0.007 when training on PASCAL augmented training set, train_aug. When
# fine-tuning on PASCAL trainval set, use learning rate=0.0001.
flags.DEFINE_float('base_learning_rate', .0001,
'The base learning rate for model training.')
flags.DEFINE_float('learning_rate_decay_factor', 0.1,
'The rate to decay the base learning rate.')
flags.DEFINE_integer('learning_rate_decay_step', 2000,
'Decay the base learning rate at a fixed step.')
flags.DEFINE_float('learning_power', 0.9,
'The power value used in the poly learning policy.')
flags.DEFINE_integer('training_number_of_steps', 30000,
                     'The number of steps used for training.')
flags.DEFINE_float('momentum', 0.9, 'The momentum value to use.')
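# Under the 'poly' policy, the learning rate decays following the usual poly
# schedule (see train_utils.get_model_learning_rate for the exact code):
#   lr = base_learning_rate *
#        (1 - global_step / training_number_of_steps) ** learning_power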
# When fine_tune_batch_norm=True, use a batch size of at least 12 (a batch
# size of 16 or more is preferred). Otherwise, use a smaller batch size and
# set fine_tune_batch_norm=False.
flags.DEFINE_integer('train_batch_size', 8,
'The number of images in each batch during training.')
flags.DEFINE_float('weight_decay', 0.00004,
'The value of the weight decay for training.')
flags.DEFINE_multi_integer('train_crop_size', [513, 513],
'Image crop size [height, width] during training.')
flags.DEFINE_float('last_layer_gradient_multiplier', 1.0,
'The gradient multiplier for last layers, which is used to '
'boost the gradient of last layers if the value > 1.')
flags.DEFINE_boolean('upsample_logits', True,
'Upsample logits during training.')
# Settings for fine-tuning the network.
flags.DEFINE_string('tf_initial_checkpoint', None,
'The initial checkpoint in tensorflow format.')
# Set to False if one does not want to re-use the trained classifier weights.
flags.DEFINE_boolean('initialize_last_layer', True,
'Initialize the last layer.')
flags.DEFINE_integer('slow_start_step', 0,
                     'Train the model with a small learning rate for the '
                     'first few steps.')
flags.DEFINE_float('slow_start_learning_rate', 1e-4,
'Learning rate employed during slow start.')
# Set to True if one wants to fine-tune the batch norm parameters in DeepLabv3.
# Set to False and use small batch size to save GPU memory.
flags.DEFINE_boolean('fine_tune_batch_norm', True,
'Fine tune the batch norm parameters or not.')
flags.DEFINE_float('min_scale_factor', 0.5,
                   'Minimum scale factor for data augmentation.')
flags.DEFINE_float('max_scale_factor', 2.,
'Maximum scale factor for data augmentation.')
flags.DEFINE_float('scale_factor_step_size', 0.25,
'Scale factor step size for data augmentation.')
# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
# rates = [6, 12, 18] if output_stride = 16. Note one could use different
# atrous_rates/output_stride during training/evaluation.
flags.DEFINE_multi_integer('atrous_rates', None,
'Atrous rates for atrous spatial pyramid pooling.')
flags.DEFINE_integer('output_stride', 16,
'The ratio of input to output spatial resolution.')
# Dataset settings.
flags.DEFINE_string('dataset', 'pascal_voc_seg',
'Name of the segmentation dataset.')
flags.DEFINE_string('train_split', 'train',
                    'Which split of the dataset to use for training.')
flags.DEFINE_string('dataset_dir', None, 'Where the dataset resides.')
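
# A hedged example invocation (paths and values are illustrative only;
# `model_variant` and the resize flags are defined in deeplab/common.py):
#
#   python deeplab/train.py \
#     --train_logdir=/tmp/deeplab/train \
#     --tf_initial_checkpoint=/tmp/ckpt/model.ckpt \
#     --dataset_dir=/tmp/pascal_voc_seg/tfrecord \
#     --model_variant=xception_65 \
#     --atrous_rates=6 --atrous_rates=12 --atrous_rates=18 \
#     --output_stride=16 \
#     --train_crop_size=513 --train_crop_size=513 \
#     --train_batch_size=8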


def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
"""Builds a clone of DeepLab.
Args:
inputs_queue: A prefetch queue for images and labels.
outputs_to_num_classes: A map from output type to the number of classes.
For example, for the task of semantic segmentation with 21 semantic
classes, we would have outputs_to_num_classes['semantic'] = 21.
ignore_label: Ignore label.
Returns:
A map of maps from output_type (e.g., semantic prediction) to a
dictionary of multi-scale logits names to logits. For each output_type,
the dictionary has keys which correspond to the scales and values which
correspond to the logits. For example, if `scales` equals [1.0, 1.5],
then the keys would include 'merged_logits', 'logits_1.00' and
'logits_1.50'.
"""
samples = inputs_queue.dequeue()
model_options = common.ModelOptions(
outputs_to_num_classes=outputs_to_num_classes,
crop_size=FLAGS.train_crop_size,
atrous_rates=FLAGS.atrous_rates,
output_stride=FLAGS.output_stride)
outputs_to_scales_to_logits = model.multi_scale_logits(
samples[common.IMAGE],
model_options=model_options,
image_pyramid=FLAGS.image_pyramid,
weight_decay=FLAGS.weight_decay,
is_training=True,
fine_tune_batch_norm=FLAGS.fine_tune_batch_norm)
  for output, num_classes in outputs_to_num_classes.items():
train_utils.add_softmax_cross_entropy_loss_for_each_scale(
outputs_to_scales_to_logits[output],
samples[common.LABEL],
num_classes,
ignore_label,
loss_weight=1.0,
upsample_logits=FLAGS.upsample_logits,
scope=output)
return outputs_to_scales_to_logits


def main(unused_argv):
tf.logging.set_verbosity(tf.logging.INFO)
# Set up deployment (i.e., multi-GPUs and/or multi-replicas).
config = model_deploy.DeploymentConfig(
num_clones=FLAGS.num_clones,
clone_on_cpu=FLAGS.clone_on_cpu,
replica_id=FLAGS.task,
num_replicas=FLAGS.num_replicas,
num_ps_tasks=FLAGS.num_ps_tasks)
# Split the batch across GPUs.
assert FLAGS.train_batch_size % config.num_clones == 0, (
      'Training batch size not divisible by number of clones (GPUs).')
  clone_batch_size = FLAGS.train_batch_size // config.num_clones
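  # For example, train_batch_size=8 with num_clones=2 gives each clone a
  # per-GPU batch of 4 images.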
# Get dataset-dependent information.
dataset = segmentation_dataset.get_dataset(
FLAGS.dataset, FLAGS.train_split, dataset_dir=FLAGS.dataset_dir)
tf.gfile.MakeDirs(FLAGS.train_logdir)
tf.logging.info('Training on %s set', FLAGS.train_split)
with tf.Graph().as_default():
with tf.device(config.inputs_device()):
samples = input_generator.get(
dataset,
FLAGS.train_crop_size,
clone_batch_size,
min_resize_value=FLAGS.min_resize_value,
max_resize_value=FLAGS.max_resize_value,
resize_factor=FLAGS.resize_factor,
min_scale_factor=FLAGS.min_scale_factor,
max_scale_factor=FLAGS.max_scale_factor,
scale_factor_step_size=FLAGS.scale_factor_step_size,
dataset_split=FLAGS.train_split,
is_training=True,
model_variant=FLAGS.model_variant)
inputs_queue = prefetch_queue.prefetch_queue(
samples, capacity=128 * config.num_clones)
# Create the global step on the device storing the variables.
with tf.device(config.variables_device()):
global_step = tf.train.get_or_create_global_step()
# Define the model and create clones.
model_fn = _build_deeplab
model_args = (inputs_queue, {
common.OUTPUT_TYPE: dataset.num_classes
}, dataset.ignore_label)
clones = model_deploy.create_clones(config, model_fn, args=model_args)
# Gather update_ops from the first clone. These contain, for example,
# the updates for the batch_norm variables created by model_fn.
first_clone_scope = config.clone_scope(0)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)
# Gather initial summaries.
summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
# Add summaries for model variables.
for model_var in slim.get_model_variables():
summaries.add(tf.summary.histogram(model_var.op.name, model_var))
# Add summaries for losses.
for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))
# Build the optimizer based on the device specification.
with tf.device(config.optimizer_device()):
learning_rate = train_utils.get_model_learning_rate(
FLAGS.learning_policy, FLAGS.base_learning_rate,
FLAGS.learning_rate_decay_step, FLAGS.learning_rate_decay_factor,
FLAGS.training_number_of_steps, FLAGS.learning_power,
FLAGS.slow_start_step, FLAGS.slow_start_learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, FLAGS.momentum)
summaries.add(tf.summary.scalar('learning_rate', learning_rate))
startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps
with tf.device(config.variables_device()):
total_loss, grads_and_vars = model_deploy.optimize_clones(
clones, optimizer)
total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
summaries.add(tf.summary.scalar('total_loss', total_loss))
# Modify the gradients for biases and last layer variables.
last_layers = model.get_extra_layer_scopes()
grad_mult = train_utils.get_model_gradient_multipliers(
last_layers, FLAGS.last_layer_gradient_multiplier)
if grad_mult:
grads_and_vars = slim.learning.multiply_gradients(
grads_and_vars, grad_mult)
# Create gradient update op.
grad_updates = optimizer.apply_gradients(
grads_and_vars, global_step=global_step)
update_ops.append(grad_updates)
update_op = tf.group(*update_ops)
with tf.control_dependencies([update_op]):
train_tensor = tf.identity(total_loss, name='train_op')
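    # Fetching train_tensor runs the grouped update_ops (gradient updates and
    # batch norm moving-average updates) via the control dependency and
    # returns the current total loss.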
# Add the summaries from the first clone. These contain the summaries
# created by model_fn and either optimize_clones() or _gather_clone_loss().
summaries |= set(
tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
# Merge all summaries together.
summary_op = tf.summary.merge(list(summaries))
    # Soft placement allows ops without a GPU implementation to fall back to
    # the CPU.
session_config = tf.ConfigProto(
allow_soft_placement=True, log_device_placement=False)
# Start the training.
slim.learning.train(
train_tensor,
logdir=FLAGS.train_logdir,
log_every_n_steps=FLAGS.log_steps,
master=FLAGS.master,
number_of_steps=FLAGS.training_number_of_steps,
is_chief=(FLAGS.task == 0),
session_config=session_config,
startup_delay_steps=startup_delay_steps,
init_fn=train_utils.get_model_init_fn(
FLAGS.train_logdir,
FLAGS.tf_initial_checkpoint,
FLAGS.initialize_last_layer,
last_layers,
ignore_missing_vars=True),
summary_op=summary_op,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs)


if __name__ == '__main__':
flags.mark_flag_as_required('train_logdir')
flags.mark_flag_as_required('tf_initial_checkpoint')
flags.mark_flag_as_required('dataset_dir')
tf.app.run()