Commit 05ccaf88 by Lukasz Kaiser: Merge pull request #3521 from YknZhu/master

Add deeplab model in tensorflow models
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Exports trained model to TensorFlow frozen graph."""
import os
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from deeplab import common
from deeplab import input_preprocess
from deeplab import model
slim = tf.contrib.slim
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('checkpoint_path', None, 'Checkpoint path')
flags.DEFINE_string('export_path', None,
'Path to output Tensorflow frozen graph.')
flags.DEFINE_integer('num_classes', 21, 'Number of classes.')
flags.DEFINE_multi_integer('crop_size', [513, 513],
'Crop size [height, width].')
# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
# atrous_rates = [6, 12, 18] if output_stride = 16. For `mobilenet_v2`, use
# None. Note one could use different atrous_rates/output_stride during
# training/evaluation.
flags.DEFINE_multi_integer('atrous_rates', None,
'Atrous rates for atrous spatial pyramid pooling.')
flags.DEFINE_integer('output_stride', 8,
'The ratio of input to output spatial resolution.')
# Change to [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] for multi-scale inference.
flags.DEFINE_multi_float('inference_scales', [1.0],
'The scales to resize images for inference.')
flags.DEFINE_bool('add_flipped_images', False,
'Add flipped images during inference or not.')
# Input name of the exported model.
_INPUT_NAME = 'ImageTensor'
# Output name of the exported model.
_OUTPUT_NAME = 'SemanticPredictions'
def _create_input_tensors():
"""Creates and prepares input tensors for DeepLab model.
This method creates a 4-D uint8 image tensor 'ImageTensor' with shape
[1, None, None, 3]. The actual input tensor name to use during inference is
'ImageTensor:0'.
Returns:
image: Preprocessed 4-D float32 tensor with shape [1, crop_height,
crop_width, 3].
original_image_size: Original image shape tensor [height, width].
resized_image_size: Resized image shape tensor [height, width].
"""
  # The exported model takes a 4-D uint8 image tensor as input.
input_image = tf.placeholder(tf.uint8, [1, None, None, 3], name=_INPUT_NAME)
original_image_size = tf.shape(input_image)[1:3]
# Squeeze the dimension in axis=0 since `preprocess_image_and_label` assumes
# image to be 3-D.
image = tf.squeeze(input_image, axis=0)
resized_image, image, _ = input_preprocess.preprocess_image_and_label(
image,
label=None,
crop_height=FLAGS.crop_size[0],
crop_width=FLAGS.crop_size[1],
min_resize_value=FLAGS.min_resize_value,
max_resize_value=FLAGS.max_resize_value,
resize_factor=FLAGS.resize_factor,
is_training=False,
model_variant=FLAGS.model_variant)
resized_image_size = tf.shape(resized_image)[:2]
# Expand the dimension in axis=0, since the following operations assume the
# image to be 4-D.
image = tf.expand_dims(image, 0)
return image, original_image_size, resized_image_size
def main(unused_argv):
tf.logging.set_verbosity(tf.logging.INFO)
tf.logging.info('Prepare to export model to: %s', FLAGS.export_path)
with tf.Graph().as_default():
image, image_size, resized_image_size = _create_input_tensors()
model_options = common.ModelOptions(
outputs_to_num_classes={common.OUTPUT_TYPE: FLAGS.num_classes},
crop_size=FLAGS.crop_size,
atrous_rates=FLAGS.atrous_rates,
output_stride=FLAGS.output_stride)
if tuple(FLAGS.inference_scales) == (1.0,):
tf.logging.info('Exported model performs single-scale inference.')
predictions = model.predict_labels(
image,
model_options=model_options,
image_pyramid=FLAGS.image_pyramid)
else:
tf.logging.info('Exported model performs multi-scale inference.')
predictions = model.predict_labels_multi_scale(
image,
model_options=model_options,
eval_scales=FLAGS.inference_scales,
add_flipped_images=FLAGS.add_flipped_images)
# Crop the valid regions from the predictions.
semantic_predictions = tf.slice(
predictions[common.OUTPUT_TYPE],
[0, 0, 0],
[1, resized_image_size[0], resized_image_size[1]])
# Resize back the prediction to the original image size.
def _resize_label(label, label_size):
# Expand dimension of label to [1, height, width, 1] for resize operation.
label = tf.expand_dims(label, 3)
resized_label = tf.image.resize_images(
label,
label_size,
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
align_corners=True)
return tf.squeeze(resized_label, 3)
semantic_predictions = _resize_label(semantic_predictions, image_size)
semantic_predictions = tf.identity(semantic_predictions, name=_OUTPUT_NAME)
saver = tf.train.Saver(tf.model_variables())
tf.gfile.MakeDirs(os.path.dirname(FLAGS.export_path))
freeze_graph.freeze_graph_with_def_protos(
tf.get_default_graph().as_graph_def(add_shapes=True),
saver.as_saver_def(),
FLAGS.checkpoint_path,
_OUTPUT_NAME,
restore_op_name=None,
filename_tensor_name=None,
output_graph=FLAGS.export_path,
clear_devices=True,
initializer_nodes=None)
if __name__ == '__main__':
flags.mark_flag_as_required('checkpoint_path')
flags.mark_flag_as_required('export_path')
tf.app.run()
# Running DeepLab on Cityscapes Semantic Segmentation Dataset
This page walks through the steps required to run DeepLab on Cityscapes on a
local machine.
## Download dataset and convert to TFRecord
We have prepared the script (under the folder `datasets`) to convert the
Cityscapes dataset to TFRecord. Users are required to download the dataset
beforehand by registering on the [website](https://www.cityscapes-dataset.com/).
```bash
# From the tensorflow/models/research/deeplab/datasets directory.
sh convert_cityscapes.sh
```
The converted dataset will be saved at `./deeplab/datasets/cityscapes/tfrecord`.
## Recommended Directory Structure for Training and Evaluation
```
+ datasets
+ cityscapes
+ leftImg8bit
+ gtFine
+ tfrecord
+ exp
+ train_on_train_set
+ train
+ eval
+ vis
```
where the folder `train_on_train_set` stores the train/eval/vis events and
results (when training DeepLab on the Cityscapes train set).
## Running the train/eval/vis jobs
A local training job using `xception_65` can be run with the following command:
```bash
# From tensorflow/models/research/
python deeplab/train.py \
--logtostderr \
--train_split="train" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=769 \
--train_crop_size=769 \
--train_batch_size=1 \
--tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
--train_logdir=${PATH_TO_TRAIN_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_INITIAL_CHECKPOINT} is the path to the initial checkpoint
(usually an ImageNet pretrained checkpoint), ${PATH_TO_TRAIN_DIR} is the
directory to which training checkpoints and events will be written, and
${PATH_TO_DATASET} is the directory in which the Cityscapes dataset resides.
Note that for {train,eval,vis}.py:
1. We use a small batch size during training. Users can change it based on the
   available GPU memory, and set `fine_tune_batch_norm` to True or False
   depending on the use case.
2. Change `atrous_rates` from [6, 12, 18] to [12, 24, 36] if setting
   `output_stride=8`.
3. Omit the flag `decoder_output_stride` if you do not want to use the decoder
   structure.
A local evaluation job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=1025 \
--eval_crop_size=2049 \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--eval_logdir=${PATH_TO_EVAL_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_EVAL_DIR} is the directory to which evaluation
events will be written, and ${PATH_TO_DATASET} is the directory in which the
Cityscapes dataset resides.
A local visualization job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=1025 \
--vis_crop_size=2049 \
--colormap_type="cityscapes" \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--vis_logdir=${PATH_TO_VIS_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_VIS_DIR} is the directory to which
visualization results will be written, and ${PATH_TO_DATASET} is the directory
in which the Cityscapes dataset resides. Note that if you would like to save
the segmentation results for the evaluation server, set
`also_save_raw_predictions=True`.
## Running Tensorboard
Progress for training and evaluation jobs can be inspected using Tensorboard. If
using the recommended directory structure, Tensorboard can be run using the
following command:
```bash
tensorboard --logdir=${PATH_TO_LOG_DIRECTORY}
```
where `${PATH_TO_LOG_DIRECTORY}` points to the directory that contains the
train, eval, and vis directories (e.g., the folder `train_on_train_set` in the
above example). Please note it may take Tensorboard a couple of minutes to
populate with data.
# Export trained deeplab model to frozen inference graph
After model training finishes, you could export it to a frozen TensorFlow
inference graph proto. Your trained model checkpoint usually includes the
following files:
* model.ckpt-${CHECKPOINT_NUMBER}.data-00000-of-00001
* model.ckpt-${CHECKPOINT_NUMBER}.index
* model.ckpt-${CHECKPOINT_NUMBER}.meta
After you have identified a candidate checkpoint to export, you can run the
following command line to export a frozen graph:
```bash
# From tensorflow/models/research/
# Assume all checkpoint files share the same path prefix `${CHECKPOINT_PATH}`.
python deeplab/export_model.py \
--checkpoint_path=${CHECKPOINT_PATH} \
--export_path=${OUTPUT_DIR}/frozen_inference_graph.pb
```
Please also set any other model-specific flags that you used for training,
such as `model_variant`, `add_image_level_feature`, etc.
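For reference, below is a minimal sketch (TensorFlow 1.x; the image and graph
file paths are placeholders) of running inference with the exported frozen
graph. The tensor names `ImageTensor:0` and `SemanticPredictions:0` are the
input/output names defined in export_model.py:

```python
import numpy as np
from PIL import Image
import tensorflow as tf

# Load the frozen graph definition exported by export_model.py.
graph_def = tf.GraphDef()
with tf.gfile.GFile('frozen_inference_graph.pb', 'rb') as f:  # placeholder path
  graph_def.ParseFromString(f.read())

graph = tf.Graph()
with graph.as_default():
  tf.import_graph_def(graph_def, name='')

with tf.Session(graph=graph) as sess:
  # 4-D uint8 input of shape [1, height, width, 3], as the model expects.
  image = np.expand_dims(np.asarray(Image.open('image.jpg')), 0)
  predictions = sess.run('SemanticPredictions:0',
                         feed_dict={'ImageTensor:0': image})
  print(predictions.shape)  # [1, height, width], per-pixel class IDs
```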
# FAQ
___
Q1: What if I want to use other network backbones, such as ResNet [1], instead of the provided ones (e.g., Xception)?
A: Users can modify the provided core/feature_extractor.py to support more network backbones.
___
Q2: What if I want to train the model on other datasets?
A: Users can modify the provided dataset/build_{cityscapes,voc2012}_data.py and dataset/segmentation_dataset.py to build their own dataset.
___
Q3: Where can I download the PASCAL VOC augmented training set?
A: The PASCAL VOC augmented training set is provided by Bharath Hariharan et al. [2] Please refer to their [website](http://home.bharathh.info/pubs/codes/SBD/download.html) for details and consider citing their paper if using the dataset.
___
Q4: Why does the implementation not include DenseCRF [3]?
A: We have not tried it. Interested users can take a look at Philipp Krähenbühl's [website](http://graphics.stanford.edu/projects/densecrf/) and [paper](https://arxiv.org/abs/1210.5644) for details.
___
Q5: What if I want to train the model and fine-tune the batch normalization parameters?
A: Fine-tuning batch normalization requires a large batch size, so in train.py we suggest setting `num_clones` (number of GPUs on one machine) and `train_batch_size` as large as possible.
___
Q6: How can I train the model asynchronously?
A: In train.py, users can set `num_replicas` (number of machines for training) and `num_ps_tasks` (we usually set `num_ps_tasks` = `num_replicas` / 2). See slim.deployment.model_deploy for more details.
___
## References
1. **Deep Residual Learning for Image Recognition**<br />
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun<br />
[[link]](https://arxiv.org/abs/1512.03385), In CVPR, 2016.
2. **Semantic Contours from Inverse Detectors**<br />
Bharath Hariharan, Pablo Arbelaez, Lubomir Bourdev, Subhransu Maji, Jitendra Malik<br />
[[link]](http://home.bharathh.info/pubs/codes/SBD/download.html), In ICCV, 2011.
3. **Efficient Inference in Fully Connected CRFs with Gaussian Edge Potentials**<br />
Philipp Krähenbühl, Vladlen Koltun<br />
[[link]](http://graphics.stanford.edu/projects/densecrf/), In NIPS, 2011.
Image provenance:
* image1.jpg: Philippe Put, https://www.flickr.com/photos/34547181@N00/14499172124
* image2.jpg: Peretz Partensky, https://www.flickr.com/photos/ifl/3926001309
* image3.jpg: Peter Harrison, https://www.flickr.com/photos/devcentre/392585679
* vis[1-3].png: original images shown together with their DeepLab segmentation maps.
# Installation
## Dependencies
DeepLab depends on the following libraries:
* Numpy
* Pillow 1.0
* tf Slim (which is included in the "tensorflow/models/research/" checkout)
* Jupyter notebook
* Matplotlib
* Tensorflow
For detailed steps to install Tensorflow, follow the [Tensorflow installation
instructions](https://www.tensorflow.org/install/). A typical user can install
Tensorflow using one of the following commands:
```bash
# For CPU
pip install tensorflow
# For GPU
pip install tensorflow-gpu
```
The remaining libraries can be installed on Ubuntu 14.04 via apt-get and pip:
```bash
sudo apt-get install python-pil python-numpy
sudo pip install jupyter
sudo pip install matplotlib
```
## Add Libraries to PYTHONPATH
When running locally, the tensorflow/models/research/ and slim directories
should be appended to PYTHONPATH. This can be done by running the following from
tensorflow/models/research/:
```bash
# From tensorflow/models/research/
export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim
```
Note: This command needs to run from every new terminal you start. If you wish
to avoid running this manually, you can add it as a new line to the end of your
~/.bashrc file.
# Testing the Installation
You can test whether you have successfully installed DeepLab by running the
following commands:
Quick test by running model_test.py:
```bash
# From tensorflow/models/research/
python deeplab/model_test.py
```
Quickly run the whole code on the PASCAL VOC 2012 dataset:
```bash
# From tensorflow/models/research/deeplab
sh local_test.sh
```
# TensorFlow DeepLab Model Zoo
We provide DeepLab models pretrained on the PASCAL VOC 2012 and Cityscapes
datasets for reproducing our results, as well as some checkpoints that are only
pretrained on ImageNet for training your own models.
## DeepLab models trained on PASCAL VOC 2012
Un-tar'ed directory includes:
* a frozen inference graph (`frozen_inference_graph.pb`). All frozen inference
  graphs use an output stride of 8 and a single eval scale of 1.0. No left-right
  flips are used.
* a checkpoint (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`)
### Model details
We provide several checkpoints that have been pretrained on the VOC 2012
train_aug set or the train_aug + trainval set. In the former case, one can train
the model with a smaller batch size and frozen batch normalization when limited
GPU memory is available, since we have already fine-tuned the batch
normalization for you. In the latter case, one can directly evaluate the
checkpoints on the VOC 2012 test set or use them for demo purposes.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
--------------------------- | :--------------: | :-----------------: | :---: | :-----:
xception_coco_voc_trainaug | Xception_65 | MS-COCO <br> VOC 2012 train_aug set| [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
xception_coco_voc_trainval | Xception_65 | MS-COCO <br> VOC 2012 train_aug + trainval sets | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | PASCAL mIOU | File Size
------------------------------------------------------------------------------------------------------------------------ | :-------: | :------------------------: | :-------------: | :------------------: | :------------: | :----------------------------: | :-------:
[xception_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_pascal_train_aug_2018_01_04.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 54.17B <br> 3055.35B | 0.7 <br> 223.2 | 82.20% (val) <br> 83.58% (val) | 439MB
[xception_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_pascal_trainval_2018_01_04.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 3055.35B | 223.2 | 87.80% (**test**) | 439MB
In the table, we report both computation complexity (in terms of Multiply-Adds
and CPU Runtime) and segmentation performance (in terms of mIOU) on the PASCAL
VOC val or test set. The notation [0.5:0.25:1.75] denotes eval scales from 0.5
to 1.75 with a step of 0.25. The reported runtime is calculated by tfprof on a
workstation with a CPU E5-1650 v3 @ 3.50GHz and 32GB memory. Note that applying
multi-scale inputs and left-right flips increases the segmentation performance
but also significantly increases the computation, and thus may not be suitable
for real-time applications.
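For convenience, here is a minimal Python sketch (assuming Python 3; the URL is
the xception_coco_voc_trainaug entry from the table above, and the target
directory name is arbitrary) of downloading and unpacking a checkpoint:

```python
import tarfile
import urllib.request

# URL taken from the model zoo table above.
url = ('http://download.tensorflow.org/models/'
       'deeplabv3_pascal_train_aug_2018_01_04.tar.gz')
filename, _ = urllib.request.urlretrieve(url)
with tarfile.open(filename) as tar:
  # Yields the frozen inference graph and the model.ckpt files.
  tar.extractall('pretrained')
```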
## DeepLab models trained on Cityscapes
### Model details
We provide several checkpoints that have been pretrained on Cityscapes
train_fine set.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
xception_cityscapes_trainfine | Xception_65 | ImageNet <br> Cityscapes train_fine set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | Cityscapes mIOU | File Size
-------------------------------------------------------------------------------------------------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :------------: | :----------------------------: | :-------:
[xception_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
## Checkpoints pretrained on ImageNet
Un-tar'ed directory includes:
* model checkpoint (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`).
### Model details
We also provide some checkpoints that are only pretrained on ImageNet, so that
one can use them as a starting point for training one's own models.
* xception: We adapt the original Xception model to the task of semantic
segmentation with the following changes: (1) more layers, (2) all max
pooling operations are replaced by strided (atrous) separable convolutions,
and (3) extra batch-norm and ReLU after each 3x3 depthwise convolution are
added.
Model name | File Size
-------------------------------------------------------------------------------------- | :-------:
[xception](http://download.tensorflow.org/models/deeplabv3_xception_2018_01_04.tar.gz) | 447MB
## References
1. **Mobilenets: Efficient convolutional neural networks for mobile vision applications**<br />
Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam<br />
[[link]](https://arxiv.org/abs/1704.04861). arXiv:1704.04861, 2017.
2. **Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation**<br />
Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen<br />
[[link]](https://arxiv.org/abs/1801.04381). arXiv:1801.04381, 2018.
3. **Xception: Deep Learning with Depthwise Separable Convolutions**<br />
François Chollet<br />
[[link]](https://arxiv.org/abs/1610.02357). In the Proc. of CVPR, 2017.
4. **Deformable Convolutional Networks -- COCO Detection and Segmentation Challenge 2017 Entry**<br />
Haozhi Qi, Zheng Zhang, Bin Xiao, Han Hu, Bowen Cheng, Yichen Wei, Jifeng Dai<br />
[[link]](http://presentations.cocodataset.org/COCO17-Detect-MSRA.pdf). ICCV COCO Challenge
Workshop, 2017.
5. **The Pascal Visual Object Classes Challenge: A Retrospective**<br />
Mark Everingham, S. M. Ali Eslami, Luc Van Gool, Christopher K. I. Williams, John M. Winn, Andrew Zisserman<br />
[[link]](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/). IJCV, 2014.
6. **Semantic Contours from Inverse Detectors**<br />
Bharath Hariharan, Pablo Arbelaez, Lubomir Bourdev, Subhransu Maji, Jitendra Malik<br />
[[link]](http://home.bharathh.info/pubs/codes/SBD/download.html). In the Proc. of ICCV, 2011.
7. **The Cityscapes Dataset for Semantic Urban Scene Understanding**<br />
Cordts, Marius, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, Bernt Schiele. <br />
[[link]](https://www.cityscapes-dataset.com/). In the Proc. of CVPR, 2016.
8. **Microsoft COCO: Common Objects in Context**<br />
Tsung-Yi Lin, Michael Maire, Serge Belongie, Lubomir Bourdev, Ross Girshick, James Hays, Pietro Perona, Deva Ramanan, C. Lawrence Zitnick, Piotr Dollar<br />
[[link]](http://cocodataset.org/). In the Proc. of ECCV, 2014.
9. **ImageNet Large Scale Visual Recognition Challenge**<br />
Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein, Alexander C. Berg, Li Fei-Fei<br />
[[link]](http://www.image-net.org/). IJCV, 2015.
# Running DeepLab on PASCAL VOC 2012 Semantic Segmentation Dataset
This page walks through the steps required to run DeepLab on PASCAL VOC 2012 on
a local machine.
## Download dataset and convert to TFRecord
We have prepared the script (under the folder `datasets`) to download and
convert the PASCAL VOC 2012 semantic segmentation dataset to TFRecord.
```bash
# From the tensorflow/models/research/deeplab/datasets directory.
sh download_and_convert_voc2012.sh
```
The converted dataset will be saved at
`./deeplab/datasets/pascal_voc_seg/tfrecord`.
## Recommended Directory Structure for Training and Evaluation
```
+ datasets
+ pascal_voc_seg
+ VOCdevkit
+ VOC2012
+ JPEGImages
+ SegmentationClass
+ tfrecord
+ exp
+ train_on_train_set
+ train
+ eval
+ vis
```
where the folder `train_on_train_set` stores the train/eval/vis events and
results (when training DeepLab on the PASCAL VOC 2012 train set).
## Running the train/eval/vis jobs
A local training job using `xception_65` can be run with the following command:
```bash
# From tensorflow/models/research/
python deeplab/train.py \
--logtostderr \
--train_split="train" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_batch_size=1 \
--tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
--train_logdir=${PATH_TO_TRAIN_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_INITIAL_CHECKPOINT} is the path to the initial checkpoint
(usually an ImageNet pretrained checkpoint), ${PATH_TO_TRAIN_DIR} is the
directory to which training checkpoints and events will be written, and
${PATH_TO_DATASET} is the directory in which the PASCAL VOC 2012 dataset
resides.
Note that for {train,eval,vis}.py:
1. We use a small batch size during training. Users can change it based on the
   available GPU memory, and set `fine_tune_batch_norm` to True or False
   depending on the use case.
2. Change `atrous_rates` from [6, 12, 18] to [12, 24, 36] if setting
   `output_stride=8`.
3. Omit the flag `decoder_output_stride` if you do not want to use the decoder
   structure.
A local evaluation job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=513 \
--eval_crop_size=513 \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--eval_logdir=${PATH_TO_EVAL_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_EVAL_DIR} is the directory to which evaluation
events will be written, and ${PATH_TO_DATASET} is the directory in which the
PASCAL VOC 2012 dataset resides.
A local visualization job using `xception_65` can be run with the following
command:
```bash
# From tensorflow/models/research/
python deeplab/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=513 \
--vis_crop_size=513 \
--checkpoint_dir=${PATH_TO_CHECKPOINT} \
--vis_logdir=${PATH_TO_VIS_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
where ${PATH_TO_CHECKPOINT} is the path to the trained checkpoint (i.e., the
path to train_logdir), ${PATH_TO_VIS_DIR} is the directory to which
visualization results will be written, and ${PATH_TO_DATASET} is the directory
in which the PASCAL VOC 2012 dataset resides. Note that if you would like to
save the segmentation results for the evaluation server, set
`also_save_raw_predictions=True`.
## Running Tensorboard
Progress for training and evaluation jobs can be inspected using Tensorboard. If
using the recommended directory structure, Tensorboard can be run using the
following command:
```bash
tensorboard --logdir=${PATH_TO_LOG_DIRECTORY}
```
where `${PATH_TO_LOG_DIRECTORY}` points to the directory that contains the
train, eval, and vis directories (e.g., the folder `train_on_train_set` in the
above example). Please note it may take Tensorboard a couple of minutes to
populate with data.
## Example
We provide a script to run the {train,eval,vis,export_model}.py on the PASCAL VOC
2012 dataset as an example. See the code in local_test.sh for details.
```bash
# From tensorflow/models/research/deeplab
sh local_test.sh
```
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prepares the data used for DeepLab training/evaluation."""
import tensorflow as tf
from deeplab.core import feature_extractor
from deeplab.core import preprocess_utils
# The probability of flipping the images and labels
# left-right during training
_PROB_OF_FLIP = 0.5
def preprocess_image_and_label(image,
label,
crop_height,
crop_width,
min_resize_value=None,
max_resize_value=None,
resize_factor=None,
min_scale_factor=1.,
max_scale_factor=1.,
scale_factor_step_size=0,
ignore_label=255,
is_training=True,
model_variant=None):
"""Preprocesses the image and label.
Args:
image: Input image.
label: Ground truth annotation label.
crop_height: The height value used to crop the image and label.
crop_width: The width value used to crop the image and label.
min_resize_value: Desired size of the smaller image side.
max_resize_value: Maximum allowed size of the larger image side.
resize_factor: Resized dimensions are multiple of factor plus one.
min_scale_factor: Minimum scale factor value.
max_scale_factor: Maximum scale factor value.
scale_factor_step_size: The step size from min scale factor to max scale
factor. The input is randomly scaled based on the value of
(min_scale_factor, max_scale_factor, scale_factor_step_size).
ignore_label: The label value which will be ignored for training and
evaluation.
is_training: If the preprocessing is used for training or not.
model_variant: Model variant (string) for choosing how to mean-subtract the
images. See feature_extractor.network_map for supported model variants.
Returns:
original_image: Original image (could be resized).
processed_image: Preprocessed image.
label: Preprocessed ground truth segmentation label.
Raises:
ValueError: Ground truth label not provided during training.
"""
if is_training and label is None:
raise ValueError('During training, label must be provided.')
if model_variant is None:
tf.logging.warning('Default mean-subtraction is performed. Please specify '
'a model_variant. See feature_extractor.network_map for '
'supported model variants.')
# Keep reference to original image.
original_image = image
processed_image = tf.cast(image, tf.float32)
if label is not None:
label = tf.cast(label, tf.int32)
# Resize image and label to the desired range.
if min_resize_value is not None or max_resize_value is not None:
[processed_image, label] = (
preprocess_utils.resize_to_range(
image=processed_image,
label=label,
min_size=min_resize_value,
max_size=max_resize_value,
factor=resize_factor,
align_corners=True))
# The `original_image` becomes the resized image.
original_image = tf.identity(processed_image)
# Data augmentation by randomly scaling the inputs.
scale = preprocess_utils.get_random_scale(
min_scale_factor, max_scale_factor, scale_factor_step_size)
processed_image, label = preprocess_utils.randomly_scale_image_and_label(
processed_image, label, scale)
processed_image.set_shape([None, None, 3])
# Pad image and label to have dimensions >= [crop_height, crop_width]
image_shape = tf.shape(processed_image)
image_height = image_shape[0]
image_width = image_shape[1]
target_height = image_height + tf.maximum(crop_height - image_height, 0)
target_width = image_width + tf.maximum(crop_width - image_width, 0)
# Pad image with mean pixel value.
mean_pixel = tf.reshape(
feature_extractor.mean_pixel(model_variant), [1, 1, 3])
processed_image = preprocess_utils.pad_to_bounding_box(
processed_image, 0, 0, target_height, target_width, mean_pixel)
if label is not None:
label = preprocess_utils.pad_to_bounding_box(
label, 0, 0, target_height, target_width, ignore_label)
# Randomly crop the image and label.
if is_training and label is not None:
processed_image, label = preprocess_utils.random_crop(
[processed_image, label], crop_height, crop_width)
processed_image.set_shape([crop_height, crop_width, 3])
if label is not None:
label.set_shape([crop_height, crop_width, 1])
if is_training:
# Randomly left-right flip the image and label.
processed_image, label, _ = preprocess_utils.flip_dim(
[processed_image, label], _PROB_OF_FLIP, dim=1)
return original_image, processed_image, label
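# Example (evaluation-time usage, as in export_model.py): calling
# preprocess_image_and_label(image, label=None, crop_height=513,
# crop_width=513, is_training=False, model_variant='xception_65') returns the
# (possibly resized) original image, the padded float32 image, and None for
# the label.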
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# This script is used to run a local test on PASCAL VOC 2012. Users could also
# modify this script for their own use case.
#
# Usage:
# # From the tensorflow/models/research/deeplab directory.
# sh ./local_test.sh
#
#
# Exit immediately if a command exits with a non-zero status.
set -e
# Move one-level up to tensorflow/models/research directory.
cd ..
# Update PYTHONPATH.
export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim
# Set up the working environment.
CURRENT_DIR=$(pwd)
WORK_DIR="${CURRENT_DIR}/deeplab"
# Run model_test first to make sure the PYTHONPATH is correctly set.
python "${WORK_DIR}"/model_test.py -v
# Go to datasets folder and download PASCAL VOC 2012 segmentation dataset.
DATASET_DIR="datasets"
cd "${WORK_DIR}/${DATASET_DIR}"
sh download_and_convert_voc2012.sh
# Go back to original directory.
cd "${CURRENT_DIR}"
# Set up the working directories.
PASCAL_FOLDER="pascal_voc_seg"
EXP_FOLDER="exp/train_on_trainval_set"
INIT_FOLDER="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/init_models"
TRAIN_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/train"
EVAL_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/eval"
VIS_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/vis"
EXPORT_DIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/export"
mkdir -p "${INIT_FOLDER}"
mkdir -p "${TRAIN_LOGDIR}"
mkdir -p "${EVAL_LOGDIR}"
mkdir -p "${VIS_LOGDIR}"
mkdir -p "${EXPORT_DIR}"
# Copy the trained checkpoint locally to use as the initial checkpoint.
TF_INIT_ROOT="http://download.tensorflow.org/models"
TF_INIT_CKPT="deeplabv3_pascal_train_aug_2018_01_04.tar.gz"
cd "${INIT_FOLDER}"
wget -nd -c "${TF_INIT_ROOT}/${TF_INIT_CKPT}"
tar -xf "${TF_INIT_CKPT}"
cd "${CURRENT_DIR}"
PASCAL_DATASET="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/tfrecord"
# Train 10 iterations.
NUM_ITERATIONS=10
python "${WORK_DIR}"/train.py \
--logtostderr \
--train_split="trainval" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_batch_size=4 \
--training_number_of_steps="${NUM_ITERATIONS}" \
--fine_tune_batch_norm=true \
--tf_initial_checkpoint="${INIT_FOLDER}/deeplabv3_pascal_train_aug/model.ckpt" \
--train_logdir="${TRAIN_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}"
# Run evaluation. This performs eval over the full val split (1449 images) and
# will take a while.
# Using the provided checkpoint, one should expect mIOU=82.20%.
python "${WORK_DIR}"/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--eval_crop_size=513 \
--eval_crop_size=513 \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--eval_logdir="${EVAL_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
--max_number_of_evaluations=1
# Visualize the results.
python "${WORK_DIR}"/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--vis_crop_size=513 \
--vis_crop_size=513 \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--vis_logdir="${VIS_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
--max_number_of_iterations=1
# Export the trained checkpoint.
CKPT_PATH="${TRAIN_LOGDIR}/model.ckpt-${NUM_ITERATIONS}"
EXPORT_PATH="${EXPORT_DIR}/frozen_inference_graph.pb"
python "${WORK_DIR}"/export_model.py \
--logtostderr \
--checkpoint_path="${CKPT_PATH}" \
--export_path="${EXPORT_PATH}" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--num_classes=21 \
--crop_size=513 \
--crop_size=513 \
--inference_scales=1.0
# Run inference with the exported checkpoint.
# Please refer to the provided deeplab_demo.ipynb for an example.
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Provides DeepLab model definition and helper functions.
DeepLab is a deep learning system for semantic image segmentation with
the following features:
(1) Atrous convolution to explicitly control the resolution at which
feature responses are computed within Deep Convolutional Neural Networks.
(2) Atrous spatial pyramid pooling (ASPP) to robustly segment objects at
multiple scales with filters at multiple sampling rates and effective
fields-of-views.
(3) ASPP module augmented with image-level feature and batch normalization.
(4) A simple yet effective decoder module to recover the object boundaries.
See the following papers for more details:
"Encoder-Decoder with Atrous Separable Convolution for Semantic Image
Segmentation"
Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam.
(https://arxiv.org/abs/1802.02611)
"Rethinking Atrous Convolution for Semantic Image Segmentation,"
Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam
(https://arxiv.org/abs/1706.05587)
"DeepLab: Semantic Image Segmentation with Deep Convolutional Nets,
Atrous Convolution, and Fully Connected CRFs",
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy,
Alan L Yuille (* equal contribution)
(https://arxiv.org/abs/1606.00915)
"Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected
CRFs"
Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy,
Alan L. Yuille (* equal contribution)
(https://arxiv.org/abs/1412.7062)
"""
import tensorflow as tf
from deeplab.core import feature_extractor
slim = tf.contrib.slim
_LOGITS_SCOPE_NAME = 'logits'
_MERGED_LOGITS_SCOPE = 'merged_logits'
_IMAGE_POOLING_SCOPE = 'image_pooling'
_ASPP_SCOPE = 'aspp'
_CONCAT_PROJECTION_SCOPE = 'concat_projection'
_DECODER_SCOPE = 'decoder'
def get_extra_layer_scopes():
"""Gets the scopes for extra layers.
Returns:
A list of scopes for extra layers.
"""
return [
_LOGITS_SCOPE_NAME,
_IMAGE_POOLING_SCOPE,
_ASPP_SCOPE,
_CONCAT_PROJECTION_SCOPE,
_DECODER_SCOPE,
]
def predict_labels_multi_scale(images,
model_options,
eval_scales=(1.0,),
add_flipped_images=False):
"""Predicts segmentation labels.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
eval_scales: The scales to resize images for evaluation.
add_flipped_images: Add flipped images for evaluation or not.
Returns:
A dictionary with keys specifying the output_type (e.g., semantic
prediction) and values storing Tensors representing predictions (argmax
over channels). Each prediction has size [batch, height, width].
"""
outputs_to_predictions = {
output: []
for output in model_options.outputs_to_num_classes
}
for i, image_scale in enumerate(eval_scales):
with tf.variable_scope(tf.get_variable_scope(), reuse=True if i else None):
outputs_to_scales_to_logits = multi_scale_logits(
images,
model_options=model_options,
image_pyramid=[image_scale],
is_training=False,
fine_tune_batch_norm=False)
if add_flipped_images:
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
outputs_to_scales_to_logits_reversed = multi_scale_logits(
tf.reverse_v2(images, [2]),
model_options=model_options,
image_pyramid=[image_scale],
is_training=False,
fine_tune_batch_norm=False)
for output in sorted(outputs_to_scales_to_logits):
scales_to_logits = outputs_to_scales_to_logits[output]
logits = tf.image.resize_bilinear(
scales_to_logits[_MERGED_LOGITS_SCOPE],
tf.shape(images)[1:3],
align_corners=True)
outputs_to_predictions[output].append(
tf.expand_dims(tf.nn.softmax(logits), 4))
if add_flipped_images:
scales_to_logits_reversed = (
outputs_to_scales_to_logits_reversed[output])
logits_reversed = tf.image.resize_bilinear(
tf.reverse_v2(scales_to_logits_reversed[_MERGED_LOGITS_SCOPE], [2]),
tf.shape(images)[1:3],
align_corners=True)
outputs_to_predictions[output].append(
tf.expand_dims(tf.nn.softmax(logits_reversed), 4))
for output in sorted(outputs_to_predictions):
predictions = outputs_to_predictions[output]
# Compute average prediction across different scales and flipped images.
predictions = tf.reduce_mean(tf.concat(predictions, 4), axis=4)
outputs_to_predictions[output] = tf.argmax(predictions, 3)
return outputs_to_predictions
def predict_labels(images, model_options, image_pyramid=None):
"""Predicts segmentation labels.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
image_pyramid: Input image scales for multi-scale feature extraction.
Returns:
A dictionary with keys specifying the output_type (e.g., semantic
prediction) and values storing Tensors representing predictions (argmax
over channels). Each prediction has size [batch, height, width].
"""
outputs_to_scales_to_logits = multi_scale_logits(
images,
model_options=model_options,
image_pyramid=image_pyramid,
is_training=False,
fine_tune_batch_norm=False)
predictions = {}
for output in sorted(outputs_to_scales_to_logits):
scales_to_logits = outputs_to_scales_to_logits[output]
logits = tf.image.resize_bilinear(
scales_to_logits[_MERGED_LOGITS_SCOPE],
tf.shape(images)[1:3],
align_corners=True)
predictions[output] = tf.argmax(logits, 3)
return predictions
def scale_dimension(dim, scale):
"""Scales the input dimension.
Args:
dim: Input dimension (a scalar or a scalar Tensor).
scale: The amount of scaling applied to the input.
Returns:
Scaled dimension.
"""
if isinstance(dim, tf.Tensor):
return tf.cast((tf.to_float(dim) - 1.0) * scale + 1.0, dtype=tf.int32)
else:
return int((float(dim) - 1.0) * scale + 1.0)
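# For example, scale_dimension(321, 0.5) == int((321 - 1) * 0.5 + 1) == 161,
# following the (dim - 1) * scale + 1 convention that matches the
# align_corners resizing used throughout this file.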
def multi_scale_logits(images,
model_options,
image_pyramid,
weight_decay=0.0001,
is_training=False,
fine_tune_batch_norm=False):
"""Gets the logits for multi-scale inputs.
The returned logits are all downsampled (due to max-pooling layers)
for both training and evaluation.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
image_pyramid: Input image scales for multi-scale feature extraction.
weight_decay: The weight decay for model variables.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
outputs_to_scales_to_logits: A map of maps from output_type (e.g.,
semantic prediction) to a dictionary of multi-scale logits names to
logits. For each output_type, the dictionary has keys which
correspond to the scales and values which correspond to the logits.
For example, if `scales` equals [1.0, 1.5], then the keys would
include 'merged_logits', 'logits_1.00' and 'logits_1.50'.
Raises:
ValueError: If model_options doesn't specify crop_size and its
add_image_level_feature = True, since add_image_level_feature requires
crop_size information. Or, if model_options has model_variant =
'mobilenet_v2' but atrous_rates or decoder_output_stride are not None.
"""
# Setup default values.
if not image_pyramid:
image_pyramid = [1.0]
if model_options.crop_size is None and model_options.add_image_level_feature:
raise ValueError(
'Crop size must be specified for using image-level feature.')
crop_height = (
model_options.crop_size[0]
if model_options.crop_size else tf.shape(images)[1])
crop_width = (
model_options.crop_size[1]
if model_options.crop_size else tf.shape(images)[2])
# Compute the height, width for the output logits.
logits_output_stride = (
model_options.decoder_output_stride or model_options.output_stride)
logits_height = scale_dimension(
crop_height,
max(1.0, max(image_pyramid)) / logits_output_stride)
logits_width = scale_dimension(
crop_width,
max(1.0, max(image_pyramid)) / logits_output_stride)
# Compute the logits for each scale in the image pyramid.
outputs_to_scales_to_logits = {
k: {}
for k in model_options.outputs_to_num_classes
}
for count, image_scale in enumerate(image_pyramid):
if image_scale != 1.0:
scaled_height = scale_dimension(crop_height, image_scale)
scaled_width = scale_dimension(crop_width, image_scale)
scaled_crop_size = [scaled_height, scaled_width]
scaled_images = tf.image.resize_bilinear(
images, scaled_crop_size, align_corners=True)
if model_options.crop_size:
scaled_images.set_shape([None, scaled_height, scaled_width, 3])
else:
scaled_crop_size = model_options.crop_size
scaled_images = images
updated_options = model_options._replace(crop_size=scaled_crop_size)
outputs_to_logits = _get_logits(
scaled_images,
updated_options,
weight_decay=weight_decay,
reuse=True if count else None,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
# Resize the logits to have the same dimension before merging.
for output in sorted(outputs_to_logits):
outputs_to_logits[output] = tf.image.resize_bilinear(
outputs_to_logits[output], [logits_height, logits_width],
align_corners=True)
# Return when only one input scale.
if len(image_pyramid) == 1:
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_scales_to_logits[output][
_MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
return outputs_to_scales_to_logits
# Save logits to the output map.
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_scales_to_logits[output][
'logits_%.2f' % image_scale] = outputs_to_logits[output]
# Merge the logits from all the multi-scale inputs.
for output in sorted(model_options.outputs_to_num_classes):
# Concatenate the multi-scale logits for each output type.
all_logits = [
tf.expand_dims(logits, axis=4)
for logits in outputs_to_scales_to_logits[output].values()
]
all_logits = tf.concat(all_logits, 4)
merge_fn = (
tf.reduce_max
if model_options.merge_method == 'max' else tf.reduce_mean)
outputs_to_scales_to_logits[output][_MERGED_LOGITS_SCOPE] = merge_fn(
all_logits, axis=4)
return outputs_to_scales_to_logits
def _extract_features(images,
model_options,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
"""Extracts features by the particular model_variant.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
weight_decay: The weight decay for model variables.
reuse: Reuse the model variables or not.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
concat_logits: A tensor of size [batch, feature_height, feature_width,
feature_channels], where feature_height/feature_width are determined by
the images height/width and output_stride.
end_points: A dictionary from components of the network to the corresponding
activation.
"""
features, end_points = feature_extractor.extract_features(
images,
output_stride=model_options.output_stride,
multi_grid=model_options.multi_grid,
model_variant=model_options.model_variant,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
if not model_options.aspp_with_batch_norm:
return features, end_points
else:
batch_norm_params = {
'is_training': is_training and fine_tune_batch_norm,
'decay': 0.9997,
'epsilon': 1e-5,
'scale': True,
}
with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
padding='SAME',
stride=1,
reuse=reuse):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
depth = 256
branch_logits = []
if model_options.add_image_level_feature:
pool_height = scale_dimension(model_options.crop_size[0],
1. / model_options.output_stride)
pool_width = scale_dimension(model_options.crop_size[1],
1. / model_options.output_stride)
image_feature = slim.avg_pool2d(
features, [pool_height, pool_width], [pool_height, pool_width],
padding='VALID')
image_feature = slim.conv2d(
image_feature, depth, 1, scope=_IMAGE_POOLING_SCOPE)
image_feature = tf.image.resize_bilinear(
image_feature, [pool_height, pool_width], align_corners=True)
image_feature.set_shape([None, pool_height, pool_width, depth])
branch_logits.append(image_feature)
# Employ a 1x1 convolution.
branch_logits.append(slim.conv2d(features, depth, 1,
scope=_ASPP_SCOPE + str(0)))
if model_options.atrous_rates:
# Employ 3x3 convolutions with different atrous rates.
for i, rate in enumerate(model_options.atrous_rates, 1):
scope = _ASPP_SCOPE + str(i)
if model_options.aspp_with_separable_conv:
aspp_features = _split_separable_conv2d(
features,
filters=depth,
rate=rate,
weight_decay=weight_decay,
scope=scope)
else:
aspp_features = slim.conv2d(
features, depth, 3, rate=rate, scope=scope)
branch_logits.append(aspp_features)
# Merge branch logits.
concat_logits = tf.concat(branch_logits, 3)
concat_logits = slim.conv2d(
concat_logits, depth, 1, scope=_CONCAT_PROJECTION_SCOPE)
concat_logits = slim.dropout(
concat_logits,
keep_prob=0.9,
is_training=is_training,
scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
return concat_logits, end_points
def _get_logits(images,
model_options,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
"""Gets the logits by atrous/image spatial pyramid pooling.
Args:
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
weight_decay: The weight decay for model variables.
reuse: Reuse the model variables or not.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
outputs_to_logits: A map from output_type to logits.
"""
features, end_points = _extract_features(
images,
model_options,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
if model_options.decoder_output_stride is not None:
decoder_height = scale_dimension(model_options.crop_size[0],
1.0 / model_options.decoder_output_stride)
decoder_width = scale_dimension(model_options.crop_size[1],
1.0 / model_options.decoder_output_stride)
features = refine_by_decoder(
features,
end_points,
decoder_height=decoder_height,
decoder_width=decoder_width,
decoder_use_separable_conv=model_options.decoder_use_separable_conv,
model_variant=model_options.model_variant,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
outputs_to_logits = {}
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_logits[output] = _get_branch_logits(
features,
model_options.outputs_to_num_classes[output],
model_options.atrous_rates,
aspp_with_batch_norm=model_options.aspp_with_batch_norm,
kernel_size=model_options.logits_kernel_size,
weight_decay=weight_decay,
reuse=reuse,
scope_suffix=output)
return outputs_to_logits
def refine_by_decoder(features,
end_points,
decoder_height,
decoder_width,
decoder_use_separable_conv=False,
model_variant=None,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
"""Adds the decoder to obtain sharper segmentation results.
Args:
features: A tensor of size [batch, features_height, features_width,
features_channels].
end_points: A dictionary from components of the network to the corresponding
activation.
decoder_height: The height of decoder feature maps.
decoder_width: The width of decoder feature maps.
decoder_use_separable_conv: Employ separable convolution for decoder or not.
model_variant: Model variant for feature extraction.
weight_decay: The weight decay for model variables.
reuse: Reuse the model variables or not.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
Returns:
Decoder output with size [batch, decoder_height, decoder_width,
decoder_channels].
"""
batch_norm_params = {
'is_training': is_training and fine_tune_batch_norm,
'decay': 0.9997,
'epsilon': 1e-5,
'scale': True,
}
with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
padding='SAME',
stride=1,
reuse=reuse):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
with tf.variable_scope(_DECODER_SCOPE, _DECODER_SCOPE, [features]):
feature_list = feature_extractor.networks_to_feature_maps[
model_variant][feature_extractor.DECODER_END_POINTS]
if feature_list is None:
          tf.logging.info('No decoder end points found.')
return features
else:
decoder_features = features
for i, name in enumerate(feature_list):
decoder_features_list = [decoder_features]
feature_name = '{}/{}'.format(
feature_extractor.name_scope[model_variant], name)
decoder_features_list.append(
slim.conv2d(
end_points[feature_name],
48,
1,
scope='feature_projection' + str(i)))
# Resize to decoder_height/decoder_width.
for j, feature in enumerate(decoder_features_list):
decoder_features_list[j] = tf.image.resize_bilinear(
feature, [decoder_height, decoder_width], align_corners=True)
decoder_features_list[j].set_shape(
[None, decoder_height, decoder_width, None])
decoder_depth = 256
if decoder_use_separable_conv:
decoder_features = _split_separable_conv2d(
tf.concat(decoder_features_list, 3),
filters=decoder_depth,
rate=1,
weight_decay=weight_decay,
scope='decoder_conv0')
decoder_features = _split_separable_conv2d(
decoder_features,
filters=decoder_depth,
rate=1,
weight_decay=weight_decay,
scope='decoder_conv1')
else:
num_convs = 2
decoder_features = slim.repeat(
tf.concat(decoder_features_list, 3),
num_convs,
slim.conv2d,
decoder_depth,
3,
scope='decoder_conv' + str(i))
return decoder_features
def _get_branch_logits(features,
num_classes,
atrous_rates=None,
aspp_with_batch_norm=False,
kernel_size=1,
weight_decay=0.0001,
reuse=None,
scope_suffix=''):
"""Gets the logits from each model's branch.
The underlying model is branched out in the last layer when atrous
spatial pyramid pooling is employed, and all branches are sum-merged
to form the final logits.
Args:
features: A float tensor of shape [batch, height, width, channels].
num_classes: Number of classes to predict.
atrous_rates: A list of atrous convolution rates for last layer.
aspp_with_batch_norm: Use batch normalization layers for ASPP.
kernel_size: Kernel size for convolution.
weight_decay: Weight decay for the model variables.
reuse: Reuse model variables or not.
scope_suffix: Scope suffix for the model variables.
Returns:
Merged logits with shape [batch, height, width, num_classes].
Raises:
ValueError: Upon invalid input kernel_size value.
"""
# When using batch normalization with ASPP, ASPP has been applied before
# in _extract_features, and thus we simply apply 1x1 convolution here.
if aspp_with_batch_norm or atrous_rates is None:
if kernel_size != 1:
raise ValueError('Kernel size must be 1 when atrous_rates is None or '
                       'using aspp_with_batch_norm. Got %d.' % kernel_size)
atrous_rates = [1]
with slim.arg_scope(
[slim.conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
reuse=reuse):
with tf.variable_scope(_LOGITS_SCOPE_NAME, _LOGITS_SCOPE_NAME, [features]):
branch_logits = []
for i, rate in enumerate(atrous_rates):
scope = scope_suffix
if i:
scope += '_%d' % i
branch_logits.append(
slim.conv2d(
features,
num_classes,
kernel_size=kernel_size,
rate=rate,
activation_fn=None,
normalizer_fn=None,
scope=scope))
return tf.add_n(branch_logits)
def _split_separable_conv2d(inputs,
filters,
rate=1,
weight_decay=0.00004,
depthwise_weights_initializer_stddev=0.33,
pointwise_weights_initializer_stddev=0.06,
scope=None):
"""Splits a separable conv2d into depthwise and pointwise conv2d.
This operation differs from `tf.layers.separable_conv2d` as this operation
applies activation function between depthwise and pointwise conv2d.
Args:
inputs: Input tensor with shape [batch, height, width, channels].
filters: Number of filters in the 1x1 pointwise convolution.
rate: Atrous convolution rate for the depthwise convolution.
weight_decay: The weight decay to use for regularizing the model.
depthwise_weights_initializer_stddev: The standard deviation of the
truncated normal weight initializer for depthwise convolution.
pointwise_weights_initializer_stddev: The standard deviation of the
truncated normal weight initializer for pointwise convolution.
scope: Optional scope for the operation.
Returns:
Computed features after split separable conv2d.
"""
outputs = slim.separable_conv2d(
inputs,
None,
3,
depth_multiplier=1,
rate=rate,
weights_initializer=tf.truncated_normal_initializer(
stddev=depthwise_weights_initializer_stddev),
weights_regularizer=None,
scope=scope + '_depthwise')
return slim.conv2d(
outputs,
filters,
1,
weights_initializer=tf.truncated_normal_initializer(
stddev=pointwise_weights_initializer_stddev),
weights_regularizer=slim.l2_regularizer(weight_decay),
scope=scope + '_pointwise')
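
# Worked example (a sketch, not part of the original file): for a 3x3
# separable conv mapping 256 channels to 256 filters, the depthwise pass costs
# 3*3*256 = 2,304 weights and the pointwise pass 256*256 = 65,536 weights,
# versus 3*3*256*256 = 589,824 for a dense 3x3 conv, roughly an 8.7x parameter
# reduction.
#
#   features = tf.random_uniform((1, 65, 65, 256))
#   refined = _split_separable_conv2d(features, filters=256, rate=2,
#                                     scope='example')
#   # refined has shape [1, 65, 65, 256].
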
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for DeepLab model and some helper functions."""
import tensorflow as tf
from deeplab import common
from deeplab import model
class DeeplabModelTest(tf.test.TestCase):
def testScaleDimensionOutput(self):
self.assertEqual(161, model.scale_dimension(321, 0.5))
self.assertEqual(193, model.scale_dimension(321, 0.6))
self.assertEqual(241, model.scale_dimension(321, 0.75))
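    # The expected values follow the align_corners-style formula, assuming
    # scale_dimension computes int((dim - 1) * scale + 1.0):
    #   (321 - 1) * 0.5  + 1 = 161
    #   (321 - 1) * 0.6  + 1 = 193
    #   (321 - 1) * 0.75 + 1 = 241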

  def testWrongDeepLabVariant(self):
model_options = common.ModelOptions([])._replace(
model_variant='no_such_variant')
with self.assertRaises(ValueError):
model._get_logits(images=[], model_options=model_options)

  def testBuildDeepLabv2(self):
batch_size = 2
crop_size = [41, 41]
# Test with two image_pyramids.
image_pyramids = [[1], [0.5, 1]]
    # Model variants to test.
    model_variants = ['xception_65']
# Test with two output_types.
outputs_to_num_classes = {'semantic': 3,
'direction': 2}
expected_endpoints = [['merged_logits'],
['merged_logits',
'logits_0.50',
'logits_1.00']]
expected_num_logits = [1, 3]
for model_variant in model_variants:
model_options = common.ModelOptions(outputs_to_num_classes)._replace(
add_image_level_feature=False,
aspp_with_batch_norm=False,
aspp_with_separable_conv=False,
model_variant=model_variant)
for i, image_pyramid in enumerate(image_pyramids):
g = tf.Graph()
with g.as_default():
with self.test_session(graph=g):
inputs = tf.random_uniform(
(batch_size, crop_size[0], crop_size[1], 3))
outputs_to_scales_to_logits = model.multi_scale_logits(
inputs, model_options, image_pyramid=image_pyramid)
# Check computed results for each output type.
for output in outputs_to_num_classes:
scales_to_logits = outputs_to_scales_to_logits[output]
self.assertListEqual(sorted(scales_to_logits.keys()),
sorted(expected_endpoints[i]))
              # Expected number of logits = len(image_pyramid) + 1, since the
              # last logits are merged from all the scales.
              self.assertEqual(len(scales_to_logits), expected_num_logits[i])

  def testForwardpassDeepLabv3plus(self):
crop_size = [33, 33]
outputs_to_num_classes = {'semantic': 3}
model_options = common.ModelOptions(
outputs_to_num_classes,
crop_size,
atrous_rates=[6],
output_stride=16
)._replace(
add_image_level_feature=True,
aspp_with_batch_norm=True,
aspp_with_separable_conv=True,
decoder_output_stride=4,
decoder_use_separable_conv=True,
logits_kernel_size=1,
model_variant='xception_65')
g = tf.Graph()
with g.as_default():
with self.test_session(graph=g) as sess:
inputs = tf.random_uniform(
(1, crop_size[0], crop_size[1], 3))
outputs_to_scales_to_logits = model.multi_scale_logits(
inputs,
model_options,
image_pyramid=[1.0])
sess.run(tf.global_variables_initializer())
outputs_to_scales_to_logits = sess.run(outputs_to_scales_to_logits)
# Check computed results for each output type.
for output in outputs_to_num_classes:
scales_to_logits = outputs_to_scales_to_logits[output]
# Expect only one output.
          self.assertEqual(len(scales_to_logits), 1)
for logits in scales_to_logits.values():
self.assertTrue(logits.any())


if __name__ == '__main__':
tf.test.main()
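
# To run these tests (assuming the repository root is on PYTHONPATH):
#   python deeplab/model_test.py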
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training script for the DeepLab model.
See model.py for more details and usage.
"""
import tensorflow as tf
from deeplab import common
from deeplab import model
from deeplab.datasets import segmentation_dataset
from deeplab.utils import input_generator
from deeplab.utils import train_utils
from deployment import model_deploy
slim = tf.contrib.slim
prefetch_queue = slim.prefetch_queue
flags = tf.app.flags
FLAGS = flags.FLAGS
# Settings for multi-GPUs/multi-replicas training.
flags.DEFINE_integer('num_clones', 1, 'Number of clones to deploy.')
flags.DEFINE_boolean('clone_on_cpu', False, 'Use CPUs to deploy clones.')
flags.DEFINE_integer('num_replicas', 1, 'Number of worker replicas.')
flags.DEFINE_integer('startup_delay_steps', 15,
'Number of training steps between replicas startup.')
flags.DEFINE_integer('num_ps_tasks', 0,
'The number of parameter servers. If the value is 0, then '
'the parameters are handled locally by the worker.')
flags.DEFINE_string('master', '', 'BNS name of the tensorflow server')
flags.DEFINE_integer('task', 0, 'The task ID.')
# Settings for logging.
flags.DEFINE_string('train_logdir', None,
'Where the checkpoint and logs are stored.')
flags.DEFINE_integer('log_steps', 10,
'Display logging information at every log_steps.')
flags.DEFINE_integer('save_interval_secs', 1200,
'How often, in seconds, we save the model to disk.')
flags.DEFINE_integer('save_summaries_secs', 600,
'How often, in seconds, we compute the summaries.')
# Settings for training strategy.
flags.DEFINE_enum('learning_policy', 'poly', ['poly', 'step'],
'Learning rate policy for training.')
# Use 0.007 when training on PASCAL augmented training set, train_aug. When
# fine-tuning on PASCAL trainval set, use learning rate=0.0001.
flags.DEFINE_float('base_learning_rate', .0001,
'The base learning rate for model training.')
flags.DEFINE_float('learning_rate_decay_factor', 0.1,
'The rate to decay the base learning rate.')
flags.DEFINE_integer('learning_rate_decay_step', 2000,
'Decay the base learning rate at a fixed step.')
flags.DEFINE_float('learning_power', 0.9,
'The power value used in the poly learning policy.')
flags.DEFINE_integer('training_number_of_steps', 30000,
                     'The number of steps used for training.')
flags.DEFINE_float('momentum', 0.9, 'The momentum value to use.')
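# Under the 'poly' policy, the learning rate decays following the usual poly
# schedule (see train_utils.get_model_learning_rate for the exact code):
#   lr = base_learning_rate *
#        (1 - global_step / training_number_of_steps) ** learning_power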
# When fine_tune_batch_norm=True, use a batch size of at least 12 (a batch
# size of 16 or more is preferred). Otherwise, use a smaller batch size and
# set fine_tune_batch_norm=False.
flags.DEFINE_integer('train_batch_size', 8,
'The number of images in each batch during training.')
flags.DEFINE_float('weight_decay', 0.00004,
'The value of the weight decay for training.')
flags.DEFINE_multi_integer('train_crop_size', [513, 513],
'Image crop size [height, width] during training.')
flags.DEFINE_float('last_layer_gradient_multiplier', 1.0,
'The gradient multiplier for last layers, which is used to '
'boost the gradient of last layers if the value > 1.')
flags.DEFINE_boolean('upsample_logits', True,
'Upsample logits during training.')
# Settings for fine-tuning the network.
flags.DEFINE_string('tf_initial_checkpoint', None,
'The initial checkpoint in tensorflow format.')
# Set to False if one does not want to re-use the trained classifier weights.
flags.DEFINE_boolean('initialize_last_layer', True,
'Initialize the last layer.')
flags.DEFINE_integer('slow_start_step', 0,
                     'Train the model with a small learning rate for the '
                     'first few steps.')
flags.DEFINE_float('slow_start_learning_rate', 1e-4,
'Learning rate employed during slow start.')
# Set to True if one wants to fine-tune the batch norm parameters in DeepLabv3.
# Set to False and use small batch size to save GPU memory.
flags.DEFINE_boolean('fine_tune_batch_norm', True,
'Fine tune the batch norm parameters or not.')
flags.DEFINE_float('min_scale_factor', 0.5,
                   'Minimum scale factor for data augmentation.')
flags.DEFINE_float('max_scale_factor', 2.,
'Maximum scale factor for data augmentation.')
flags.DEFINE_float('scale_factor_step_size', 0.25,
'Scale factor step size for data augmentation.')
# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
# rates = [6, 12, 18] if output_stride = 16. Note one could use different
# atrous_rates/output_stride during training/evaluation.
flags.DEFINE_multi_integer('atrous_rates', None,
'Atrous rates for atrous spatial pyramid pooling.')
flags.DEFINE_integer('output_stride', 16,
'The ratio of input to output spatial resolution.')
# Dataset settings.
flags.DEFINE_string('dataset', 'pascal_voc_seg',
'Name of the segmentation dataset.')
flags.DEFINE_string('train_split', 'train',
                    'Which split of the dataset to use for training.')
flags.DEFINE_string('dataset_dir', None, 'Where the dataset resides.')
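
# A hedged example invocation (paths and values are illustrative only;
# `model_variant` and the resize flags are defined in deeplab/common.py):
#
#   python deeplab/train.py \
#     --train_logdir=/tmp/deeplab/train \
#     --tf_initial_checkpoint=/tmp/ckpt/model.ckpt \
#     --dataset_dir=/tmp/pascal_voc_seg/tfrecord \
#     --model_variant=xception_65 \
#     --atrous_rates=6 --atrous_rates=12 --atrous_rates=18 \
#     --output_stride=16 \
#     --train_crop_size=513 --train_crop_size=513 \
#     --train_batch_size=8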


def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
"""Builds a clone of DeepLab.
Args:
inputs_queue: A prefetch queue for images and labels.
outputs_to_num_classes: A map from output type to the number of classes.
For example, for the task of semantic segmentation with 21 semantic
classes, we would have outputs_to_num_classes['semantic'] = 21.
ignore_label: Ignore label.
Returns:
A map of maps from output_type (e.g., semantic prediction) to a
dictionary of multi-scale logits names to logits. For each output_type,
the dictionary has keys which correspond to the scales and values which
correspond to the logits. For example, if `scales` equals [1.0, 1.5],
then the keys would include 'merged_logits', 'logits_1.00' and
'logits_1.50'.
"""
samples = inputs_queue.dequeue()
model_options = common.ModelOptions(
outputs_to_num_classes=outputs_to_num_classes,
crop_size=FLAGS.train_crop_size,
atrous_rates=FLAGS.atrous_rates,
output_stride=FLAGS.output_stride)
outputs_to_scales_to_logits = model.multi_scale_logits(
samples[common.IMAGE],
model_options=model_options,
image_pyramid=FLAGS.image_pyramid,
weight_decay=FLAGS.weight_decay,
is_training=True,
fine_tune_batch_norm=FLAGS.fine_tune_batch_norm)
  for output, num_classes in outputs_to_num_classes.items():
train_utils.add_softmax_cross_entropy_loss_for_each_scale(
outputs_to_scales_to_logits[output],
samples[common.LABEL],
num_classes,
ignore_label,
loss_weight=1.0,
upsample_logits=FLAGS.upsample_logits,
scope=output)
return outputs_to_scales_to_logits


def main(unused_argv):
tf.logging.set_verbosity(tf.logging.INFO)
# Set up deployment (i.e., multi-GPUs and/or multi-replicas).
config = model_deploy.DeploymentConfig(
num_clones=FLAGS.num_clones,
clone_on_cpu=FLAGS.clone_on_cpu,
replica_id=FLAGS.task,
num_replicas=FLAGS.num_replicas,
num_ps_tasks=FLAGS.num_ps_tasks)
# Split the batch across GPUs.
assert FLAGS.train_batch_size % config.num_clones == 0, (
      'Training batch size not divisible by number of clones (GPUs).')
  clone_batch_size = FLAGS.train_batch_size // config.num_clones
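  # For example, train_batch_size=8 with num_clones=2 gives each clone a
  # per-GPU batch of 4 images.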
# Get dataset-dependent information.
dataset = segmentation_dataset.get_dataset(
FLAGS.dataset, FLAGS.train_split, dataset_dir=FLAGS.dataset_dir)
tf.gfile.MakeDirs(FLAGS.train_logdir)
tf.logging.info('Training on %s set', FLAGS.train_split)
with tf.Graph().as_default():
with tf.device(config.inputs_device()):
samples = input_generator.get(
dataset,
FLAGS.train_crop_size,
clone_batch_size,
min_resize_value=FLAGS.min_resize_value,
max_resize_value=FLAGS.max_resize_value,
resize_factor=FLAGS.resize_factor,
min_scale_factor=FLAGS.min_scale_factor,
max_scale_factor=FLAGS.max_scale_factor,
scale_factor_step_size=FLAGS.scale_factor_step_size,
dataset_split=FLAGS.train_split,
is_training=True,
model_variant=FLAGS.model_variant)
inputs_queue = prefetch_queue.prefetch_queue(
samples, capacity=128 * config.num_clones)
# Create the global step on the device storing the variables.
with tf.device(config.variables_device()):
global_step = tf.train.get_or_create_global_step()
# Define the model and create clones.
model_fn = _build_deeplab
model_args = (inputs_queue, {
common.OUTPUT_TYPE: dataset.num_classes
}, dataset.ignore_label)
clones = model_deploy.create_clones(config, model_fn, args=model_args)
# Gather update_ops from the first clone. These contain, for example,
# the updates for the batch_norm variables created by model_fn.
first_clone_scope = config.clone_scope(0)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)
# Gather initial summaries.
summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
# Add summaries for model variables.
for model_var in slim.get_model_variables():
summaries.add(tf.summary.histogram(model_var.op.name, model_var))
# Add summaries for losses.
for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))
# Build the optimizer based on the device specification.
with tf.device(config.optimizer_device()):
learning_rate = train_utils.get_model_learning_rate(
FLAGS.learning_policy, FLAGS.base_learning_rate,
FLAGS.learning_rate_decay_step, FLAGS.learning_rate_decay_factor,
FLAGS.training_number_of_steps, FLAGS.learning_power,
FLAGS.slow_start_step, FLAGS.slow_start_learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, FLAGS.momentum)
summaries.add(tf.summary.scalar('learning_rate', learning_rate))
startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps
with tf.device(config.variables_device()):
total_loss, grads_and_vars = model_deploy.optimize_clones(
clones, optimizer)
total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
summaries.add(tf.summary.scalar('total_loss', total_loss))
# Modify the gradients for biases and last layer variables.
last_layers = model.get_extra_layer_scopes()
grad_mult = train_utils.get_model_gradient_multipliers(
last_layers, FLAGS.last_layer_gradient_multiplier)
if grad_mult:
grads_and_vars = slim.learning.multiply_gradients(
grads_and_vars, grad_mult)
# Create gradient update op.
grad_updates = optimizer.apply_gradients(
grads_and_vars, global_step=global_step)
update_ops.append(grad_updates)
update_op = tf.group(*update_ops)
with tf.control_dependencies([update_op]):
train_tensor = tf.identity(total_loss, name='train_op')
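    # Fetching train_tensor runs the grouped update_ops (gradient updates and
    # batch norm moving-average updates) via the control dependency and
    # returns the current total loss.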
# Add the summaries from the first clone. These contain the summaries
# created by model_fn and either optimize_clones() or _gather_clone_loss().
summaries |= set(
tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
# Merge all summaries together.
summary_op = tf.summary.merge(list(summaries))
    # Soft placement allows ops without a GPU implementation to fall back to
    # the CPU.
session_config = tf.ConfigProto(
allow_soft_placement=True, log_device_placement=False)
# Start the training.
slim.learning.train(
train_tensor,
logdir=FLAGS.train_logdir,
log_every_n_steps=FLAGS.log_steps,
master=FLAGS.master,
number_of_steps=FLAGS.training_number_of_steps,
is_chief=(FLAGS.task == 0),
session_config=session_config,
startup_delay_steps=startup_delay_steps,
init_fn=train_utils.get_model_init_fn(
FLAGS.train_logdir,
FLAGS.tf_initial_checkpoint,
FLAGS.initialize_last_layer,
last_layers,
ignore_missing_vars=True),
summary_op=summary_op,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs)


if __name__ == '__main__':
flags.mark_flag_as_required('train_logdir')
flags.mark_flag_as_required('tf_initial_checkpoint')
flags.mark_flag_as_required('dataset_dir')
tf.app.run()