Merge pull request #3521 from YknZhu/master

Add deeplab model in tensorflow models

Merge pull request #3521 from YknZhu/master
Add deeplab model in tensorflow models
05ccaf88 · Lukasz Kaiser · GitHub · 6571d16d · 1e9b07d8 · 05ccaf88
Unverified Commit 05ccaf88 authored Mar 08, 2018 by Lukasz Kaiser Committed by GitHub Mar 08, 2018
20 changed files
--- a/research/deeplab/export_model.py
+++ b/research/deeplab/export_model.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exports trained model to TensorFlow frozen graph."""
+
+import os
+import tensorflow as tf
+
+from tensorflow.python.tools import freeze_graph
+from deeplab import common
+from deeplab import input_preprocess
+from deeplab import model
+
+# Short aliases for the TensorFlow contrib/app modules.
+slim = tf.contrib.slim
+flags = tf.app.flags
+
+FLAGS = flags.FLAGS
+
+# Both path flags default to None and are marked as required in the
+# `__main__` guard at the bottom of this file.
+flags.DEFINE_string('checkpoint_path', None, 'Checkpoint path')
+
+flags.DEFINE_string('export_path', None,
+                    'Path to output Tensorflow frozen graph.')
+
+flags.DEFINE_integer('num_classes', 21, 'Number of classes.')
+
+# Consumed below as crop_size[0] = height and crop_size[1] = width.
+flags.DEFINE_multi_integer('crop_size', [513, 513],
+                           'Crop size [height, width].')
+
+# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
+# atrous_rates = [6, 12, 18] if output_stride = 16. For `mobilenet_v2`, use
+# None. Note one could use different atrous_rates/output_stride during
+# training/evaluation.
+flags.DEFINE_multi_integer('atrous_rates', None,
+                           'Atrous rates for atrous spatial pyramid pooling.')
+
+flags.DEFINE_integer('output_stride', 8,
+                     'The ratio of input to output spatial resolution.')
+
+# Change to [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] for multi-scale inference.
+# `main` selects the single-scale code path only when this is exactly [1.0].
+flags.DEFINE_multi_float('inference_scales', [1.0],
+                         'The scales to resize images for inference.')
+
+# Only forwarded to the model on the multi-scale inference path in `main`.
+flags.DEFINE_bool('add_flipped_images', False,
+                  'Add flipped images during inference or not.')
+
+# Input name of the exported model (name of the uint8 image placeholder).
+_INPUT_NAME = 'ImageTensor'
+
+# Output name of the exported model (also passed to the freeze step as the
+# output node name).
+_OUTPUT_NAME = 'SemanticPredictions'
+
+
+def _create_input_tensors():
+  """Creates and prepares input tensors for DeepLab model.
+
+  This method creates a 4-D uint8 image tensor 'ImageTensor' with shape
+  [1, None, None, 3]. The actual input tensor name to use during inference is
+  'ImageTensor:0'.
+
+  The preprocessing is configured from FLAGS.crop_size as well as
+  FLAGS.min_resize_value, FLAGS.max_resize_value, FLAGS.resize_factor and
+  FLAGS.model_variant. The latter four flags are not defined in this file;
+  presumably they are declared by `deeplab.common` -- verify.
+
+  Returns:
+    image: Preprocessed 4-D float32 tensor with shape [1, crop_height,
+      crop_width, 3].
+    original_image_size: Original image shape tensor [height, width].
+    resized_image_size: Resized image shape tensor [height, width].
+  """
+  # input_preprocess takes 4-D image tensor as input.
+  input_image = tf.placeholder(tf.uint8, [1, None, None, 3], name=_INPUT_NAME)
+  # Dimensions 1 and 2 of the placeholder, i.e. the dynamic height and width.
+  original_image_size = tf.shape(input_image)[1:3]
+
+  # Squeeze the dimension in axis=0 since `preprocess_image_and_label` assumes
+  # image to be 3-D.
+  image = tf.squeeze(input_image, axis=0)
+  # No label is available at export time and is_training=False; the third
+  # return value (the preprocessed label) is therefore discarded.
+  resized_image, image, _ = input_preprocess.preprocess_image_and_label(
+      image,
+      label=None,
+      crop_height=FLAGS.crop_size[0],
+      crop_width=FLAGS.crop_size[1],
+      min_resize_value=FLAGS.min_resize_value,
+      max_resize_value=FLAGS.max_resize_value,
+      resize_factor=FLAGS.resize_factor,
+      is_training=False,
+      model_variant=FLAGS.model_variant)
+  # `resized_image` is 3-D here, so its first two dimensions are taken as
+  # [height, width]. `main` uses this size to crop the predictions.
+  resized_image_size = tf.shape(resized_image)[:2]
+
+  # Expand the dimension in axis=0, since the following operations assume the
+  # image to be 4-D.
+  image = tf.expand_dims(image, 0)
+
+  return image, original_image_size, resized_image_size
+
+
+def main(unused_argv):
+  """Builds the inference graph and writes it as a frozen graph.
+
+  The graph maps the 'ImageTensor' placeholder to a tensor named
+  'SemanticPredictions' that is resized back to the input image size. The
+  graph is frozen with the checkpoint at FLAGS.checkpoint_path and written to
+  FLAGS.export_path.
+
+  Args:
+    unused_argv: Command-line arguments passed by `tf.app.run`; not used.
+  """
+  tf.logging.set_verbosity(tf.logging.INFO)
+  tf.logging.info('Prepare to export model to: %s', FLAGS.export_path)
+
+  # Build the export graph in a fresh tf.Graph rather than the global default.
+  with tf.Graph().as_default():
+    image, image_size, resized_image_size = _create_input_tensors()
+
+    model_options = common.ModelOptions(
+        outputs_to_num_classes={common.OUTPUT_TYPE: FLAGS.num_classes},
+        crop_size=FLAGS.crop_size,
+        atrous_rates=FLAGS.atrous_rates,
+        output_stride=FLAGS.output_stride)
+
+    # Only the exact default [1.0] selects single-scale inference; any other
+    # value of --inference_scales goes through the multi-scale path.
+    if tuple(FLAGS.inference_scales) == (1.0,):
+      tf.logging.info('Exported model performs single-scale inference.')
+      # NOTE(review): FLAGS.image_pyramid is not defined in this file;
+      # presumably it is declared by `deeplab.common` -- verify.
+      predictions = model.predict_labels(
+          image,
+          model_options=model_options,
+          image_pyramid=FLAGS.image_pyramid)
+    else:
+      tf.logging.info('Exported model performs multi-scale inference.')
+      predictions = model.predict_labels_multi_scale(
+          image,
+          model_options=model_options,
+          eval_scales=FLAGS.inference_scales,
+          add_flipped_images=FLAGS.add_flipped_images)
+
+    # Crop the valid regions from the predictions.
+    # The slice starts at [0, 0, 0] and keeps one batch element with the
+    # height and width of the resized (pre-crop) image.
+    semantic_predictions = tf.slice(
+        predictions[common.OUTPUT_TYPE],
+        [0, 0, 0],
+        [1, resized_image_size[0], resized_image_size[1]])
+    # Resize back the prediction to the original image size.
+    def _resize_label(label, label_size):
+      """Resizes a 3-D label map to `label_size` with nearest neighbor."""
+      # Expand dimension of label to [1, height, width, 1] for resize operation.
+      label = tf.expand_dims(label, 3)
+      # Nearest-neighbor keeps the values discrete class indices instead of
+      # interpolating between them.
+      resized_label = tf.image.resize_images(
+          label,
+          label_size,
+          method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
+          align_corners=True)
+      # Drop the channel dimension that was added above.
+      return tf.squeeze(resized_label, 3)
+    semantic_predictions = _resize_label(semantic_predictions, image_size)
+    # Give the final tensor a fixed name; the same name is handed to the
+    # freeze step below as the output node.
+    semantic_predictions = tf.identity(semantic_predictions, name=_OUTPUT_NAME)
+
+    saver = tf.train.Saver(tf.model_variables())
+
+    # Make sure the directory that will contain the exported file exists.
+    # NOTE(review): for an export_path without a directory component,
+    # os.path.dirname returns '' -- confirm that MakeDirs accepts that.
+    tf.gfile.MakeDirs(os.path.dirname(FLAGS.export_path))
+    # The four positional arguments are presumably the input graph def, the
+    # saver def, the input checkpoint and the output node names -- confirm
+    # against the freeze_graph signature of the TensorFlow version in use.
+    freeze_graph.freeze_graph_with_def_protos(
+        tf.get_default_graph().as_graph_def(add_shapes=True),
+        saver.as_saver_def(),
+        FLAGS.checkpoint_path,
+        _OUTPUT_NAME,
+        restore_op_name=None,
+        filename_tensor_name=None,
+        output_graph=FLAGS.export_path,
+        clear_devices=True,
+        initializer_nodes=None)
+
+
+if __name__ == '__main__':
+  # Both flags default to None, so require them to be set explicitly before
+  # handing control to `main` via tf.app.run().
+  flags.mark_flag_as_required('checkpoint_path')
+  flags.mark_flag_as_required('export_path')
+  tf.app.run()
--- a/research/deeplab/g3doc/cityscapes.md
+++ b/research/deeplab/g3doc/cityscapes.md
+# Running DeepLab on Cityscapes Semantic Segmentation Dataset
+
+This page walks through the steps required to run DeepLab on Cityscapes on a
+local machine.
+
+## Download dataset and convert to TFRecord
+
+We have prepared a script (under the folder `datasets`) to convert the
+Cityscapes dataset to TFRecord. Users are required to download the dataset
+beforehand by registering on the [website](https://www.cityscapes-dataset.com/).
+
+```bash
+# From the tensorflow/models/research/deeplab/datasets directory.
+sh convert_cityscapes.sh
+```
+
+The converted dataset will be saved at ./deeplab/datasets/cityscapes/tfrecord.
+
+## Recommended Directory Structure for Training and Evaluation
+
+```
+ datasets
+  + cityscapes
+    + leftImg8bit
+    + gtFine
+    + tfrecord
+    + exp
+      + train_on_train_set
+        + train
+        + eval
+        + vis
+```
+
+where the folder `train_on_train_set` stores the train/eval/vis events and
+results (when training DeepLab on the Cityscapes train set).
+
+## Running the train/eval/vis jobs
+
+A local training job using `xception_65` can be run with the following command:
+
+```bash
+# From tensorflow/models/research/
+python deeplab/train.py \
+    --logtostderr \
+    --train_split="train" \
+    --model_variant="xception_65" \
+    --atrous_rates=6 \
+    --atrous_rates=12 \
+    --atrous_rates=18 \
+    --output_stride=16 \
+    --decoder_output_stride=4 \
+    --train_crop_size=769 \
+    --train_crop_size=769 \
+    --train_batch_size=1 \
+    --tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
+    --train_logdir=${PATH_TO_TRAIN_DIR} \
+    --dataset_dir=${PATH_TO_DATASET}
+```
+
+where ${PATH_TO_INITIAL_CHECKPOINT} is the path to the initial checkpoint
+(usually an ImageNet pretrained checkpoint), ${PATH_TO_TRAIN_DIR} is the
+directory in which training checkpoints and events will be written to, and
+${PATH_TO_DATASET} is the directory in which the Cityscapes dataset resides.
+
+Note that for {train,eval,vis}.py:
+
+1.  We use small batch size during training. The users could change it based on
+    the available GPU memory and also set `fine_tune_batch_norm` to be False or
+    True depending on the use case.
+
+2.  The users should change atrous_rates from [6, 12, 18] to [12, 24, 36] if
+    setting output_stride=8.
+
+3.  The users could skip the flag, `decoder_output_stride`, if they do not want
+    to use the decoder structure.
+
+A local evaluation job using `xception_65` can be run with the following
+command:
+
+```bash
+# From tensorflow/models/research/
+python deeplab/eval.py \
+    --logtostderr \
+    --eval_split="val" \
+    --model_variant="xception_65" \
+    --atrous_rates=6 \
+    --atrous_rates=12 \
+    --atrous_rates=18 \
+    --output_stride=16 \
+    --decoder_output_stride=4 \
+    --eval_crop_size=1025 \
+    --eval_crop_size=2049 \
+    --checkpoint_dir=${PATH_TO_CHECKPOINT} \
+    --eval_logdir=${PATH_TO_EVAL_DIR} \
+    --dataset_dir=${PATH_TO_DATASET}
+```
+
+where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
+path to train_logdir), ${PATH_TO_EVAL_DIR} is the directory in which evaluation
+events will be written to, and ${PATH_TO_DATASET} is the directory in which the
+Cityscapes dataset resides.
+
+A local visualization job using `xception_65` can be run with the following
+command:
+
+```bash
+# From tensorflow/models/research/
+python deeplab/vis.py \
+    --logtostderr \
+    --vis_split="val" \
+    --model_variant="xception_65" \
+    --atrous_rates=6 \
+    --atrous_rates=12 \
+    --atrous_rates=18 \
+    --output_stride=16 \
+    --decoder_output_stride=4 \
+    --vis_crop_size=1025 \
+    --vis_crop_size=2049 \
+    --colormap_type="cityscapes" \
+    --checkpoint_dir=${PATH_TO_CHECKPOINT} \
+    --vis_logdir=${PATH_TO_VIS_DIR} \
+    --dataset_dir=${PATH_TO_DATASET}
+```
+
+where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
+path to train_logdir), ${PATH_TO_VIS_DIR} is the directory to which the
+visualization results will be written, and ${PATH_TO_DATASET} is the directory
+in which the Cityscapes dataset resides. Note that if the users would like to
+save the segmentation results for the evaluation server, they should set
+also_save_raw_predictions = True.
+
+## Running Tensorboard
+
+Progress for training and evaluation jobs can be inspected using Tensorboard. If
+using the recommended directory structure, Tensorboard can be run using the
+following command:
+
+```bash
+tensorboard --logdir=${PATH_TO_LOG_DIRECTORY}
+```
+
+where `${PATH_TO_LOG_DIRECTORY}` points to the directory that contains the
+train, eval, and vis directories (e.g., the folder `train_on_train_set` in the
+above example). Please note it may take Tensorboard a couple minutes to populate
+with data.
--- a/research/deeplab/g3doc/export_model.md
+++ b/research/deeplab/g3doc/export_model.md
+# Export trained deeplab model to frozen inference graph
+
+After model training finishes, you could export it to a frozen TensorFlow
+inference graph proto. Your trained model checkpoint usually includes the
+following files:
+
+*   model.ckpt-${CHECKPOINT_NUMBER}.data-00000-of-00001,
+*   model.ckpt-${CHECKPOINT_NUMBER}.index
+*   model.ckpt-${CHECKPOINT_NUMBER}.meta
+
+After you have identified a candidate checkpoint to export, you can run the
+following command to export it to a frozen graph:
+
+```bash
+# From tensorflow/models/research/
+# Assume all checkpoint files share the same path prefix `${CHECKPOINT_PATH}`.
+python deeplab/export_model.py \
+    --checkpoint_path=${CHECKPOINT_PATH} \
+    --export_path=${OUTPUT_DIR}/frozen_inference_graph.pb
+```
+
+Please also add the other model-specific flags that you used for training, such
+as `model_variant`, `add_image_level_feature`, etc.
--- a/research/deeplab/g3doc/faq.md
+++ b/research/deeplab/g3doc/faq.md
+# FAQ
+___
+Q1: What if I want to use other network backbones, such as ResNet [1], instead of only the provided ones (e.g., Xception)?
+
+A: The users could modify the provided core/feature_extractor.py to support more network backbones.
+___
+Q2: What if I want to train the model on other datasets?
+
+A: The users could modify the provided datasets/build_{cityscapes,voc2012}_data.py and datasets/segmentation_dataset.py to build their own dataset.
+___
+Q3: Where can I download the PASCAL VOC augmented training set?
+
+A: The PASCAL VOC augmented training set is provided by Bharath Hariharan et al. [2] Please refer to their [website](http://home.bharathh.info/pubs/codes/SBD/download.html) for details and consider citing their paper if using the dataset.
+___
+Q4: Why does the implementation not include DenseCRF [3]?
+
+A: We have not tried this. The interested users could take a look at Philipp Krähenbühl's [website](http://graphics.stanford.edu/projects/densecrf/) and [paper](https://arxiv.org/abs/1210.5644) for details.
+___
+Q5: What if I want to train the model and fine-tune the batch normalization parameters?
+
+A: Fine-tuning batch normalization requires large batch size, and thus in the train.py we suggest setting `num_clones` (number of GPUs on one machine) and `train_batch_size` to be as large as possible.
+___
+Q6: How can I train the model asynchronously?
+
+A: In the train.py, the users could set `num_replicas` (number of machines for training) and `num_ps_tasks` (we usually set `num_ps_tasks` = `num_replicas` / 2). See slim.deployment.model_deploy for more details.
+___
+## References
+
+1. **Deep Residual Learning for Image Recognition**<br />
+   Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun<br />
+   [[link]](https://arxiv.org/abs/1512.03385), In CVPR, 2016.
+
+2. **Semantic Contours from Inverse Detectors**<br />
+   Bharath Hariharan, Pablo Arbelaez, Lubomir Bourdev, Subhransu Maji, Jitendra Malik<br />
+   [[link]](http://home.bharathh.info/pubs/codes/SBD/download.html), In ICCV, 2011.
+
+3. **Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials**<br />
+   Philipp Krähenbühl, Vladlen Koltun<br />
+   [[link]](http://graphics.stanford.edu/projects/densecrf/), In NIPS, 2011.
--- a/research/deeplab/g3doc/img/image1.jpg
+++ b/research/deeplab/g3doc/img/image1.jpg
--- a/research/deeplab/g3doc/img/image2.jpg
+++ b/research/deeplab/g3doc/img/image2.jpg
--- a/research/deeplab/g3doc/img/image3.jpg
+++ b/research/deeplab/g3doc/img/image3.jpg
--- a/research/deeplab/g3doc/img/image_info.txt
+++ b/research/deeplab/g3doc/img/image_info.txt
+Image provenance:
+
+image1.jpg: Philippe Put,
+  https://www.flickr.com/photos/34547181@N00/14499172124
+
+image2.jpg: Peretz Partensky
+  https://www.flickr.com/photos/ifl/3926001309
+
+image3.jpg: Peter Harrison
+  https://www.flickr.com/photos/devcentre/392585679
+
+
+vis[1-3].png: Showing original image together with DeepLab segmentation map.
--- a/research/deeplab/g3doc/img/vis1.png
+++ b/research/deeplab/g3doc/img/vis1.png
--- a/research/deeplab/g3doc/img/vis2.png
+++ b/research/deeplab/g3doc/img/vis2.png
--- a/research/deeplab/g3doc/img/vis3.png
+++ b/research/deeplab/g3doc/img/vis3.png
--- a/research/deeplab/g3doc/installation.md
+++ b/research/deeplab/g3doc/installation.md
+# Installation
+
+## Dependencies
+
+DeepLab depends on the following libraries:
+
+*   Numpy
+*   Pillow 1.0
+*   tf Slim (which is included in the "tensorflow/models/research/" checkout)
+*   Jupyter notebook
+*   Matplotlib
+*   Tensorflow
+
+For detailed steps to install Tensorflow, follow the [Tensorflow installation
+instructions](https://www.tensorflow.org/install/). A typical user can install
+Tensorflow using one of the following commands:
+
+```bash
+# For CPU
+pip install tensorflow
+# For GPU
+pip install tensorflow-gpu
+```
+
+The remaining libraries can be installed on Ubuntu 14.04 via apt-get and pip:
+
+```bash
+sudo apt-get install python-pil python-numpy
+sudo pip install jupyter
+sudo pip install matplotlib
+```
+
+## Add Libraries to PYTHONPATH
+
+When running locally, the tensorflow/models/research/ and slim directories
+should be appended to PYTHONPATH. This can be done by running the following from
+tensorflow/models/research/:
+
+```bash
+# From tensorflow/models/research/
+export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim
+```
+
+Note: This command needs to run from every new terminal you start. If you wish
+to avoid running this manually, you can add it as a new line to the end of your
+~/.bashrc file.
+
+## Testing the Installation
+
+You can test if you have successfully installed the Tensorflow DeepLab by
+running the following commands:
+
+Quick test by running model_test.py:
+
+```bash
+# From tensorflow/models/research/
+python deeplab/model_test.py
+```
+
+Quick test by running the whole code on the PASCAL VOC 2012 dataset:
+
+```bash
+# From tensorflow/models/research/deeplab
+sh local_test.sh
+```
+
--- a/research/deeplab/g3doc/model_zoo.md
+++ b/research/deeplab/g3doc/model_zoo.md
+# TensorFlow DeepLab Model Zoo
+
+We provide deeplab models pretrained on PASCAL VOC 2012 and Cityscapes datasets
+for reproducing our results, as well as some checkpoints that are only
+pretrained on ImageNet for training your own models.
+
+## DeepLab models trained on PASCAL VOC 2012
+
+Un-tar'ed directory includes:
+
+*   a frozen inference graph (`frozen_inference_graph.pb`). All frozen inference
+    graphs use output stride of 8 and a single eval scale of 1.0. No left-right
+    flips are used.
+
+*   a checkpoint (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`)
+
+### Model details
+
+We provide several checkpoints that have been pretrained on VOC 2012 train_aug
+set or train_aug + trainval set. In the former case, one could train their model
+with smaller batch size and freeze batch normalization when limited GPU memory
+is available, since we have already fine-tuned the batch normalization for you.
+In the latter case, one could directly evaluate the checkpoints on VOC 2012 test
+set or use this checkpoint for demo.
+
+Checkpoint name             | Network backbone | Pretrained  dataset | ASPP  | Decoder
+--------------------------- | :--------------: | :-----------------: | :---: | :-----:
+xception_coco_voc_trainaug  | Xception_65  | MS-COCO <br> VOC 2012 train_aug set| [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
+xception_coco_voc_trainval  | Xception_65  | MS-COCO <br> VOC 2012 train_aug + trainval sets | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
+
+In the table, **OS** denotes output stride.
+
+Checkpoint name                                                                                                          | Eval OS   | Eval scales                | Left-right Flip | Multiply-Adds        | Runtime (sec)  | PASCAL mIOU                    | File Size
+------------------------------------------------------------------------------------------------------------------------ | :-------: | :------------------------: | :-------------: | :------------------: | :------------: | :----------------------------: | :-------:
+[xception_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_pascal_train_aug_2018_01_04.tar.gz)         | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes     | 54.17B <br> 3055.35B | 0.7 <br> 223.2 | 82.20% (val) <br> 83.58% (val) | 439MB
+[xception_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_pascal_trainval_2018_01_04.tar.gz)          | 8         | [0.5:0.25:1.75]            | Yes             | 3055.35B             | 223.2          | 87.80% (**test**)              | 439MB
+
+In the table, we report both computation complexity (in terms of Multiply-Adds
+and CPU Runtime) and segmentation performance (in terms of mIOU) on the PASCAL
+VOC val or test set. The reported runtime is calculated by tfprof on a
+workstation with CPU E5-1650 v3 @ 3.50GHz and 32GB memory. Note that applying
+multi-scale inputs and left-right flips increases the segmentation performance
+but also significantly increases the computation and thus may not be suitable
+for real-time applications.
+
+## DeepLab models trained on Cityscapes
+
+### Model details
+
+We provide several checkpoints that have been pretrained on Cityscapes
+train_fine set.
+
+Checkpoint name                       | Network backbone | Pretrained dataset                      | ASPP                                             | Decoder
+------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
+xception_cityscapes_trainfine         | Xception_65      | ImageNet <br> Cityscapes train_fine set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
+
+In the table, **OS** denotes output stride.
+
+Checkpoint name                                                                                                                  | Eval OS   | Eval scales                 | Left-right Flip | Multiply-Adds         | Runtime (sec)  | Cityscapes mIOU                | File Size
+-------------------------------------------------------------------------------------------------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :------------: | :----------------------------: | :-------:
+[xception_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz)              | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes     | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
+
+## Checkpoints pretrained on ImageNet
+
+Un-tar'ed directory includes:
+
+*   model checkpoint (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`).
+
+### Model details
+
+We also provide some checkpoints that are only pretrained on ImageNet so that
+you can use them for training your own models.
+
+*   xception: We adapt the original Xception model to the task of semantic
+    segmentation with the following changes: (1) more layers, (2) all max
+    pooling operations are replaced by strided (atrous) separable convolutions,
+    and (3) extra batch-norm and ReLU after each 3x3 depthwise convolution are
+    added.
+
+Model name                                                                             | File Size
+-------------------------------------------------------------------------------------- | :-------:
+[xception](http://download.tensorflow.org/models/deeplabv3_xception_2018_01_04.tar.gz) | 447MB
+
+## References
+
+1.  **Mobilenets: Efficient convolutional neural networks for mobile vision applications**<br />
+    Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam<br />
+    [[link]](https://arxiv.org/abs/1704.04861). arXiv:1704.04861, 2017.
+
+2.  **Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation**<br />
+    Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen<br />
+    [[link]](https://arxiv.org/abs/1801.04381). arXiv:1801.04381, 2018.
+
+3.  **Xception: Deep Learning with Depthwise Separable Convolutions**<br />
+    François Chollet<br />
+    [[link]](https://arxiv.org/abs/1610.02357). In the Proc. of CVPR, 2017.
+
+4.  **Deformable Convolutional Networks -- COCO Detection and Segmentation Challenge 2017 Entry**<br />
+    Haozhi Qi, Zheng Zhang, Bin Xiao, Han Hu, Bowen Cheng, Yichen Wei, Jifeng Dai<br />
+    [[link]](http://presentations.cocodataset.org/COCO17-Detect-MSRA.pdf). ICCV COCO Challenge
+    Workshop, 2017.
+
+5.  **The Pascal Visual Object Classes Challenge: A Retrospective**<br />
+    Mark Everingham, S. M. Ali Eslami, Luc Van Gool, Christopher K. I. Williams, John M. Winn, Andrew Zisserman<br />
+    [[link]](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/). IJCV, 2014.
+
+6.  **Semantic Contours from Inverse Detectors**<br />
+    Bharath Hariharan, Pablo Arbelaez, Lubomir Bourdev, Subhransu Maji, Jitendra Malik<br />
+    [[link]](http://home.bharathh.info/pubs/codes/SBD/download.html). In the Proc. of ICCV, 2011.
+
+7.  **The Cityscapes Dataset for Semantic Urban Scene Understanding**<br />
+    Cordts, Marius, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, Bernt Schiele. <br />
+    [[link]](https://www.cityscapes-dataset.com/). In the Proc. of CVPR, 2016.
+
+8.  **Microsoft COCO: Common Objects in Context**<br />
+    Tsung-Yi Lin, Michael Maire, Serge Belongie, Lubomir Bourdev, Ross Girshick, James Hays, Pietro Perona, Deva Ramanan, C. Lawrence Zitnick, Piotr Dollar<br />
+    [[link]](http://cocodataset.org/). In the Proc. of ECCV, 2014.
+
+9.  **ImageNet Large Scale Visual Recognition Challenge**<br />
+    Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein, Alexander C. Berg, Li Fei-Fei<br />
+    [[link]](http://www.image-net.org/). IJCV, 2015.
--- a/research/deeplab/g3doc/pascal.md
+++ b/research/deeplab/g3doc/pascal.md
+# Running DeepLab on PASCAL VOC 2012 Semantic Segmentation Dataset
+
+This page walks through the steps required to run DeepLab on PASCAL VOC 2012 on
+a local machine.
+
+## Download dataset and convert to TFRecord
+
+We have prepared the script (under the folder `datasets`) to download and
+convert PASCAL VOC 2012 semantic segmentation dataset to TFRecord.
+
+```bash
+# From the tensorflow/models/research/deeplab/datasets directory.
+sh download_and_convert_voc2012.sh
+```
+
+The converted dataset will be saved at
+./deeplab/datasets/pascal_voc_seg/tfrecord
+
+## Recommended Directory Structure for Training and Evaluation
+
+```
+ datasets
+  + pascal_voc_seg
+    + VOCdevkit
+      + VOC2012
+        + JPEGImages
+        + SegmentationClass
+    + tfrecord
+    + exp
+      + train_on_train_set
+        + train
+        + eval
+        + vis
+```
+
+where the folder `train_on_train_set` stores the train/eval/vis events and
+results (when training DeepLab on the PASCAL VOC 2012 train set).
+
+## Running the train/eval/vis jobs
+
+A local training job using `xception_65` can be run with the following command:
+
+```bash
+# From tensorflow/models/research/
+python deeplab/train.py \
+    --logtostderr \
+    --train_split="train" \
+    --model_variant="xception_65" \
+    --atrous_rates=6 \
+    --atrous_rates=12 \
+    --atrous_rates=18 \
+    --output_stride=16 \
+    --decoder_output_stride=4 \
+    --train_crop_size=513 \
+    --train_crop_size=513 \
+    --train_batch_size=1 \
+    --tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
+    --train_logdir=${PATH_TO_TRAIN_DIR} \
+    --dataset_dir=${PATH_TO_DATASET}
+```
+
+where ${PATH_TO_INITIAL_CHECKPOINT} is the path to the initial checkpoint
+(usually an ImageNet pretrained checkpoint), ${PATH_TO_TRAIN_DIR} is the
+directory in which training checkpoints and events will be written to, and
+${PATH_TO_DATASET} is the directory in which the PASCAL VOC 2012 dataset
+resides.
+
+Note that for {train,eval,vis}.py:
+
+1.  We use small batch size during training. The users could change it based on
+    the available GPU memory and also set `fine_tune_batch_norm` to be False or
+    True depending on the use case.
+
+2.  The users should change atrous_rates from [6, 12, 18] to [12, 24, 36] if
+    setting output_stride=8.
+
+3.  The users could skip the flag, `decoder_output_stride`, if they do not want
+    to use the decoder structure.
+
+A local evaluation job using `xception_65` can be run with the following
+command:
+
+```bash
+# From tensorflow/models/research/
+python deeplab/eval.py \
+    --logtostderr \
+    --eval_split="val" \
+    --model_variant="xception_65" \
+    --atrous_rates=6 \
+    --atrous_rates=12 \
+    --atrous_rates=18 \
+    --output_stride=16 \
+    --decoder_output_stride=4 \
+    --eval_crop_size=513 \
+    --eval_crop_size=513 \
+    --checkpoint_dir=${PATH_TO_CHECKPOINT} \
+    --eval_logdir=${PATH_TO_EVAL_DIR} \
+    --dataset_dir=${PATH_TO_DATASET}
+```
+
+where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
+path to train_logdir), ${PATH_TO_EVAL_DIR} is the directory in which evaluation
+events will be written to, and ${PATH_TO_DATASET} is the directory in which the
+PASCAL VOC 2012 dataset resides.
+
+A local visualization job using `xception_65` can be run with the following
+command:
+
+```bash
+# From tensorflow/models/research/
+python deeplab/vis.py \
+    --logtostderr \
+    --vis_split="val" \
+    --model_variant="xception_65" \
+    --atrous_rates=6 \
+    --atrous_rates=12 \
+    --atrous_rates=18 \
+    --output_stride=16 \
+    --decoder_output_stride=4 \
+    --vis_crop_size=513 \
+    --vis_crop_size=513 \
+    --checkpoint_dir=${PATH_TO_CHECKPOINT} \
+    --vis_logdir=${PATH_TO_VIS_DIR} \
+    --dataset_dir=${PATH_TO_DATASET}
+```
+
+where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
+path to train_logdir), ${PATH_TO_VIS_DIR} is the directory to which the
+visualization results will be written, and ${PATH_TO_DATASET} is the directory
+in which the PASCAL VOC 2012 dataset resides. Note that if the users would like
+to save the segmentation results for the evaluation server, they should set
+also_save_raw_predictions = True.
+
+## Running Tensorboard
+
+Progress for training and evaluation jobs can be inspected using Tensorboard. If
+using the recommended directory structure, Tensorboard can be run using the
+following command:
+
+```bash
+tensorboard --logdir=${PATH_TO_LOG_DIRECTORY}
+```
+
+where `${PATH_TO_LOG_DIRECTORY}` points to the directory that contains the
+train, eval, and vis directories (e.g., the folder `train_on_train_set` in the
+above example). Please note it may take Tensorboard a couple minutes to populate
+with data.
+
+## Example
+
+We provide a script to run the {train,eval,vis,export_model}.py on the PASCAL VOC
+2012 dataset as an example. See the code in local_test.sh for details.
+
+```bash
+# From tensorflow/models/research/deeplab
+sh local_test.sh
+```
--- a/research/deeplab/input_preprocess.py
+++ b/research/deeplab/input_preprocess.py
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Prepares the data used for DeepLab training/evaluation."""
+import tensorflow as tf
+from deeplab.core import feature_extractor
+from deeplab.core import preprocess_utils
+
+
+# The probability of flipping the images and labels
+# left-right during training. Passed to preprocess_utils.flip_dim in
+# preprocess_image_and_label.
+_PROB_OF_FLIP = 0.5
+
+
+def preprocess_image_and_label(image,
+                               label,
+                               crop_height,
+                               crop_width,
+                               min_resize_value=None,
+                               max_resize_value=None,
+                               resize_factor=None,
+                               min_scale_factor=1.,
+                               max_scale_factor=1.,
+                               scale_factor_step_size=0,
+                               ignore_label=255,
+                               is_training=True,
+                               model_variant=None):
+  """Preprocesses the image and label.
+
+  The steps, in order, are: cast the image to float32 and the label to int32,
+  optionally resize both to a size range, scale both by a randomly chosen
+  factor, pad both so that they are at least [crop_height, crop_width], then
+  (training only) take a random crop and apply a random left-right flip.
+
+  Args:
+    image: Input image.
+    label: Ground truth annotation label. May be None when is_training is
+      False; it is required when is_training is True.
+    crop_height: The height value used to crop the image and label.
+    crop_width: The width value used to crop the image and label.
+    min_resize_value: Desired size of the smaller image side.
+    max_resize_value: Maximum allowed size of the larger image side.
+    resize_factor: Resized dimensions are multiple of factor plus one.
+    min_scale_factor: Minimum scale factor value.
+    max_scale_factor: Maximum scale factor value.
+    scale_factor_step_size: The step size from min scale factor to max scale
+      factor. The input is randomly scaled based on the value of
+      (min_scale_factor, max_scale_factor, scale_factor_step_size). Note that
+      the scaling step is built regardless of is_training.
+    ignore_label: The label value which will be ignored for training and
+      evaluation. It is also the value used to pad the label.
+    is_training: If the preprocessing is used for training or not. Random
+      cropping and random flipping are only applied when this is True.
+    model_variant: Model variant (string) for choosing how to mean-subtract the
+      images. See feature_extractor.network_map for supported model variants.
+      A warning is logged when this is None.
+
+  Returns:
+    original_image: Original image (could be resized). When min_resize_value or
+      max_resize_value is given, this is the resized float32 image rather than
+      the tensor passed in.
+    processed_image: Preprocessed image, with its static shape set to
+      [crop_height, crop_width, 3].
+    label: Preprocessed ground truth segmentation label. When not None, its
+      static shape is set to [crop_height, crop_width, 1].
+
+  Raises:
+    ValueError: Ground truth label not provided during training.
+  """
+  if is_training and label is None:
+    raise ValueError('During training, label must be provided.')
+  if model_variant is None:
+    # Not an error: the None variant is still passed to
+    # feature_extractor.mean_pixel below to obtain the padding value.
+    tf.logging.warning('Default mean-subtraction is performed. Please specify '
+                       'a model_variant. See feature_extractor.network_map for '
+                       'supported model variants.')
+
+  # Keep reference to original image.
+  original_image = image
+
+  processed_image = tf.cast(image, tf.float32)
+
+  if label is not None:
+    label = tf.cast(label, tf.int32)
+
+  # Resize image and label to the desired range.
+  # Resizing is triggered when either bound is given; the other may be None.
+  if min_resize_value is not None or max_resize_value is not None:
+    [processed_image, label] = (
+        preprocess_utils.resize_to_range(
+            image=processed_image,
+            label=label,
+            min_size=min_resize_value,
+            max_size=max_resize_value,
+            factor=resize_factor,
+            align_corners=True))
+    # The `original_image` becomes the resized image.
+    original_image = tf.identity(processed_image)
+
+  # Data augmentation by randomly scaling the inputs.
+  # NOTE(review): with the default arguments (min == max == 1.) this presumably
+  # yields a scale of 1 and leaves the inputs unchanged -- confirm in
+  # preprocess_utils.get_random_scale.
+  scale = preprocess_utils.get_random_scale(
+      min_scale_factor, max_scale_factor, scale_factor_step_size)
+  processed_image, label = preprocess_utils.randomly_scale_image_and_label(
+      processed_image, label, scale)
+  # Spatial dimensions are dynamic at this point; only the channel count is
+  # declared statically.
+  processed_image.set_shape([None, None, 3])
+
+  # Pad image and label to have dimensions >= [crop_height, crop_width]
+  image_shape = tf.shape(processed_image)
+  image_height = image_shape[0]
+  image_width = image_shape[1]
+
+  # Equivalent to max(image dimension, crop dimension): padding is only added
+  # when the image is smaller than the crop along that dimension.
+  target_height = image_height + tf.maximum(crop_height - image_height, 0)
+  target_width = image_width + tf.maximum(crop_width - image_width, 0)
+
+  # Pad image with mean pixel value.
+  mean_pixel = tf.reshape(
+      feature_extractor.mean_pixel(model_variant), [1, 1, 3])
+  processed_image = preprocess_utils.pad_to_bounding_box(
+      processed_image, 0, 0, target_height, target_width, mean_pixel)
+
+  # The label is padded with ignore_label rather than a class value.
+  if label is not None:
+    label = preprocess_utils.pad_to_bounding_box(
+        label, 0, 0, target_height, target_width, ignore_label)
+
+  # Randomly crop the image and label.
+  # Image and label are cropped in a single call so that they stay aligned.
+  if is_training and label is not None:
+    processed_image, label = preprocess_utils.random_crop(
+        [processed_image, label], crop_height, crop_width)
+
+  # NOTE(review): no crop is performed when is_training is False, so these
+  # static shapes presumably assume the padded inputs are exactly
+  # [crop_height, crop_width], i.e. the input is not larger than the crop
+  # size -- confirm against the eval/vis callers.
+  processed_image.set_shape([crop_height, crop_width, 3])
+
+  if label is not None:
+    label.set_shape([crop_height, crop_width, 1])
+
+  if is_training:
+    # Randomly left-right flip the image and label.
+    # The third value returned by flip_dim is not used here.
+    processed_image, label, _ = preprocess_utils.flip_dim(
+        [processed_image, label], _PROB_OF_FLIP, dim=1)
+
+  return original_image, processed_image, label
--- a/research/deeplab/local_test.sh
+++ b/research/deeplab/local_test.sh
+#!/bin/bash
+# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# This script is used to run local test on PASCAL VOC 2012. Users could also
+# modify from this script for their use case.
+#
+# Usage:
+#   # From the tensorflow/models/research/deeplab directory.
+#   sh ./local_test.sh
+#
+#
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+
+# Move one-level up to tensorflow/models/research directory. This assumes the
+# script is launched from tensorflow/models/research/deeplab, as described in
+# the usage note above.
+cd ..
+
+# Update PYTHONPATH with the research directory and its slim sub-directory.
+# The value is quoted so that a checkout path containing whitespace is not
+# split into several words when the script is run with a plain `sh`.
+export PYTHONPATH="${PYTHONPATH}:$(pwd):$(pwd)/slim"
+
+# Set up the working environment. CURRENT_DIR is the research directory and
+# WORK_DIR is the deeplab directory inside it.
+CURRENT_DIR=$(pwd)
+WORK_DIR="${CURRENT_DIR}/deeplab"
+
+# Run model_test first to make sure the PYTHONPATH is correctly set.
+python "${WORK_DIR}"/model_test.py -v
+
+# Go to datasets folder and download PASCAL VOC 2012 segmentation dataset.
+DATASET_DIR="datasets"
+cd "${WORK_DIR}/${DATASET_DIR}"
+sh download_and_convert_voc2012.sh
+
+# Go back to original directory.
+cd "${CURRENT_DIR}"
+
+# Set up the working directories.
+PASCAL_FOLDER="pascal_voc_seg"
+EXP_FOLDER="exp/train_on_trainval_set"
+INIT_FOLDER="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/init_models"
+TRAIN_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/train"
+EVAL_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/eval"
+VIS_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/vis"
+EXPORT_DIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/export"
+mkdir -p "${INIT_FOLDER}"
+mkdir -p "${TRAIN_LOGDIR}"
+mkdir -p "${EVAL_LOGDIR}"
+mkdir -p "${VIS_LOGDIR}"
+mkdir -p "${EXPORT_DIR}"
+
+# Copy locally the trained checkpoint as the initial checkpoint.
+TF_INIT_ROOT="http://download.tensorflow.org/models"
+TF_INIT_CKPT="deeplabv3_pascal_train_aug_2018_01_04.tar.gz"
+cd "${INIT_FOLDER}"
+wget -nd -c "${TF_INIT_ROOT}/${TF_INIT_CKPT}"
+tar -xf "${TF_INIT_CKPT}"
+cd "${CURRENT_DIR}"
+
+PASCAL_DATASET="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/tfrecord"
+
+# Train 10 iterations.
+NUM_ITERATIONS=10
+python "${WORK_DIR}"/train.py \
+  --logtostderr \
+  --train_split="trainval" \
+  --model_variant="xception_65" \
+  --atrous_rates=6 \
+  --atrous_rates=12 \
+  --atrous_rates=18 \
+  --output_stride=16 \
+  --decoder_output_stride=4 \
+  --train_crop_size=513 \
+  --train_crop_size=513 \
+  --train_batch_size=4 \
+  --training_number_of_steps="${NUM_ITERATIONS}" \
+  --fine_tune_batch_norm=true \
+  --tf_initial_checkpoint="${INIT_FOLDER}/deeplabv3_pascal_train_aug/model.ckpt" \
+  --train_logdir="${TRAIN_LOGDIR}" \
+  --dataset_dir="${PASCAL_DATASET}"
+
+# Run evaluation. This performs eval over the full val split (1449 images) and
+# will take a while.
+# Using the provided checkpoint, one should expect mIOU=82.20%.
+# Checkpoints are read from ${TRAIN_LOGDIR}, i.e. the output directory of the
+# training step above, and results are written to ${EVAL_LOGDIR}. The model
+# flags (variant, atrous rates, output strides) repeat the ones used for
+# training.
+# NOTE(review): --max_number_of_evaluations=1 presumably makes the job stop
+# after a single evaluation instead of waiting for new checkpoints -- confirm
+# in eval.py.
+python "${WORK_DIR}"/eval.py \
+  --logtostderr \
+  --eval_split="val" \
+  --model_variant="xception_65" \
+  --atrous_rates=6 \
+  --atrous_rates=12 \
+  --atrous_rates=18 \
+  --output_stride=16 \
+  --decoder_output_stride=4 \
+  --eval_crop_size=513 \
+  --eval_crop_size=513 \
+  --checkpoint_dir="${TRAIN_LOGDIR}" \
+  --eval_logdir="${EVAL_LOGDIR}" \
+  --dataset_dir="${PASCAL_DATASET}" \
+  --max_number_of_evaluations=1
+
+# Visualize the results.
+python "${WORK_DIR}"/vis.py \
+  --logtostderr \
+  --vis_split="val" \
+  --model_variant="xception_65" \
+  --atrous_rates=6 \
+  --atrous_rates=12 \
+  --atrous_rates=18 \
+  --output_stride=16 \
+  --decoder_output_stride=4 \
+  --vis_crop_size=513 \
+  --vis_crop_size=513 \
+  --checkpoint_dir="${TRAIN_LOGDIR}" \
+  --vis_logdir="${VIS_LOGDIR}" \
+  --dataset_dir="${PASCAL_DATASET}" \
+  --max_number_of_iterations=1
+
+# Export the trained checkpoint.
+CKPT_PATH="${TRAIN_LOGDIR}/model.ckpt-${NUM_ITERATIONS}"
+EXPORT_PATH="${EXPORT_DIR}/frozen_inference_graph.pb"
+
+python "${WORK_DIR}"/export_model.py \
+  --logtostderr \
+  --checkpoint_path="${CKPT_PATH}" \
+  --export_path="${EXPORT_PATH}" \
+  --model_variant="xception_65" \
+  --atrous_rates=6 \
+  --atrous_rates=12 \
+  --atrous_rates=18 \
+  --output_stride=16 \
+  --decoder_output_stride=4 \
+  --num_classes=21 \
+  --crop_size=513 \
+  --crop_size=513 \
+  --inference_scales=1.0
+
+# Run inference with the exported checkpoint.
+# Please refer to the provided deeplab_demo.ipynb for an example.
--- a/research/deeplab/model.py
+++ b/research/deeplab/model.py
--- a/research/deeplab/model_test.py
+++ b/research/deeplab/model_test.py
--- a/research/deeplab/train.py
+++ b/research/deeplab/train.py
--- a/research/deeplab/utils/__init__.py
+++ b/research/deeplab/utils/__init__.py