Commit 7a2f1a3e authored by Liang-Chieh Chen, committed by yukun

PiperOrigin-RevId: 190154671

parent 67d65c69
......@@ -28,7 +28,9 @@ features:
convolution to trade-off precision and runtime.
If you find the code useful for your research, please consider citing our latest
work:
works:
* DeepLabv3+:
```
@article{deeplabv3plus2018,
......@@ -39,11 +41,21 @@ work:
}
```
* MobileNetv2:
```
@inproceedings{mobilenetv22018,
title={Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation},
author={Mark Sandler and Andrew Howard and Menglong Zhu and Andrey Zhmoginov and Liang-Chieh Chen},
booktitle={CVPR},
year={2018}
}
```
In the current implementation, we support adopting the following network
backbones:
1. MobileNetv2 [8]: A fast network structure designed for mobile devices. **We
will provide MobileNetv2 support in the next update. Please stay tuned.**
1. MobileNetv2 [8]: A fast network structure designed for mobile devices.
2. Xception [9, 10]: A powerful network structure intended for server-side
deployment.
......@@ -71,7 +83,7 @@ Some segmentation results on Flickr images:
Demo:
* <a href='deeplab_demo.ipynb'>Jupyter notebook for off-the-shelf inference.</a><br>
* <a href='https://colab.sandbox.google.com/github/tensorflow/models/blob/master/research/deeplab/deeplab_demo.ipynb'>Colab notebook for off-the-shelf inference.</a><br>
Running:
......
......@@ -39,11 +39,11 @@ flags.DEFINE_integer('logits_kernel_size', 1,
'The kernel size for the convolutional kernel that '
'generates logits.')
# We will support `mobilenet_v2' in the coming update. When using
# 'xception_65', we set atrous_rates = [6, 12, 18] (output stride 16) and
# decoder_output_stride = 4.
flags.DEFINE_enum('model_variant', 'xception_65', ['xception_65'],
'DeepLab model variants.')
# When using 'mobilenet_v2', we set atrous_rates = decoder_output_stride = None.
# When using 'xception_65', we set atrous_rates = [6, 12, 18] (output stride 16)
# and decoder_output_stride = 4.
flags.DEFINE_enum('model_variant', 'mobilenet_v2',
['xception_65', 'mobilenet_v2'], 'DeepLab model variant.')
flags.DEFINE_multi_float('image_pyramid', None,
'Input scales for multi-scale feature extraction.')
......@@ -60,7 +60,12 @@ flags.DEFINE_boolean('aspp_with_separable_conv', True,
flags.DEFINE_multi_integer('multi_grid', None,
'Employ a hierarchy of atrous rates for ResNet.')
# For `xception_65`, use decoder_output_stride = 4.
flags.DEFINE_float('depth_multiplier', 1.0,
'Multiplier for the depth (number of channels) for all '
'convolution ops used in MobileNet.')
# For `xception_65`, use decoder_output_stride = 4. For `mobilenet_v2`, use
# decoder_output_stride = None.
flags.DEFINE_integer('decoder_output_stride', None,
'The ratio of input to output spatial resolution when '
'employing decoder to refine segmentation results.')
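# For quick reference, the per-variant settings described in the comments
# above can be summarized as below. This dict is illustrative only and is
# not part of the codebase.
_ILLUSTRATIVE_SETTINGS = {
    'xception_65': {
        'atrous_rates': [6, 12, 18],     # with output_stride = 16
        'output_stride': 16,
        'decoder_output_stride': 4,
    },
    'mobilenet_v2': {
        'atrous_rates': None,            # provided checkpoints skip ASPP
        'output_stride': 16,
        'decoder_output_stride': None,   # provided checkpoints skip decoder
    },
}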
......
......@@ -18,18 +18,62 @@ import functools
import tensorflow as tf
from deeplab.core import xception
from nets.mobilenet import mobilenet as mobilenet_lib
from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim
# Default end point for MobileNetv2.
_MOBILENET_V2_FINAL_ENDPOINT = 'layer_18'
def _mobilenet_v2(net,
depth_multiplier,
output_stride,
reuse=None,
scope=None,
final_endpoint=None):
"""Auxiliary function to add support for 'reuse' to mobilenet_v2.
Args:
net: Input tensor of shape [batch_size, height, width, channels].
depth_multiplier: Float multiplier for the depth (number of channels)
for all convolution ops. The value must be greater than zero. Typical
usage will be to set this value in (0, 1) to reduce the number of
parameters or computation cost of the model.
output_stride: An integer that specifies the requested ratio of input to
output spatial resolution. If not None, then we invoke atrous convolution
if necessary to prevent the network from reducing the spatial resolution
of the activation maps. Allowed values are 8 (accurate fully convolutional
mode), 16 (fast fully convolutional mode), 32 (classification mode).
reuse: Reuse model variables.
scope: Optional variable scope.
final_endpoint: The endpoint to construct the network up to.
Returns:
Features extracted by MobileNetv2.
"""
with tf.variable_scope(
scope, 'MobilenetV2', [net], reuse=reuse) as scope:
return mobilenet_lib.mobilenet_base(
net,
conv_defs=mobilenet_v2.V2_DEF,
multiplier=depth_multiplier,
final_endpoint=final_endpoint or _MOBILENET_V2_FINAL_ENDPOINT,
output_stride=output_stride,
scope=scope)
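# A usage sketch for the helper above. This function is illustrative only
# and is not part of the original file.
def _example_mobilenet_v2_usage():
  """Extracts MobileNetv2 features from a dummy 513x513 input batch."""
  images = tf.placeholder(tf.float32, shape=[1, 513, 513, 3])
  # output_stride = 16 is the fast fully convolutional mode; with 'SAME'
  # padding, a 513x513 input yields feature maps of spatial size
  # ceil(513 / 16) = 33.
  features, end_points = _mobilenet_v2(
      images, depth_multiplier=1.0, output_stride=16, scope='MobilenetV2')
  return features, end_points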
# A map from network name to network function.
networks_map = {
'mobilenet_v2': _mobilenet_v2,
'xception_65': xception.xception_65,
}
# A map from network name to network arg scope.
arg_scopes_map = {
'mobilenet_v2': mobilenet_v2.training_scope,
'xception_65': xception.xception_arg_scope,
}
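# A sketch of how a dispatcher over the two maps above might look; the
# actual get_network() used later in this file may differ in details.
def _get_network_sketch(network_name, preprocess_images, arg_scope=None):
  """Illustrative helper: returns a callable for the named backbone."""
  func = networks_map[network_name]
  def network_fn(inputs, **kwargs):
    if preprocess_images:
      # _PREPROCESS_FN is defined further below in this file.
      inputs = _PREPROCESS_FN[network_name](inputs)
    with slim.arg_scope(arg_scope or {}):
      return func(inputs, **kwargs)
  return network_fn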
......@@ -38,6 +82,10 @@ DECODER_END_POINTS = 'decoder_end_points'
# A dictionary from network name to a map of end point features.
networks_to_feature_maps = {
'mobilenet_v2': {
# The provided checkpoint does not include the decoder module.
DECODER_END_POINTS: None,
},
'xception_65': {
DECODER_END_POINTS: [
'entry_flow/block2/unit_1/xception_module/'
......@@ -49,6 +97,7 @@ networks_to_feature_maps = {
# A map from feature extractor name to the network name scope used in the
# ImageNet pretrained versions of these models.
name_scope = {
'mobilenet_v2': 'MobilenetV2',
'xception_65': 'xception_65',
}
......@@ -68,6 +117,7 @@ def _preprocess_zero_mean_unit_range(inputs):
_PREPROCESS_FN = {
'mobilenet_v2': _preprocess_zero_mean_unit_range,
'xception_65': _preprocess_zero_mean_unit_range,
}
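# For reference, a sketch of the zero-mean unit-range transform named in the
# map above; it maps pixel values from [0, 255] to [-1, 1]. The name suffix
# marks this as an illustrative copy, not the original definition.
def _preprocess_zero_mean_unit_range_sketch(inputs):
  """Maps image values from [0, 255] to [-1, 1]."""
  return (2.0 / 255.0) * tf.to_float(inputs) - 1.0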
......@@ -99,6 +149,8 @@ def mean_pixel(model_variant=None):
def extract_features(images,
output_stride=8,
multi_grid=None,
depth_multiplier=1.0,
final_endpoint=None,
model_variant=None,
weight_decay=0.0001,
reuse=None,
......@@ -114,6 +166,9 @@ def extract_features(images,
images: A tensor of size [batch, height, width, channels].
output_stride: The ratio of input to output spatial resolution.
multi_grid: Employ a hierarchy of different atrous rates within network.
depth_multiplier: Float multiplier for the depth (number of channels)
for all convolution ops used in MobileNet.
final_endpoint: The MobileNet endpoint to construct the network up to.
model_variant: Model variant for feature extraction.
weight_decay: The weight decay for model variables.
reuse: Reuse the model variables or not.
......@@ -159,7 +214,17 @@ def extract_features(images,
reuse=reuse,
scope=name_scope[model_variant])
elif 'mobilenet' in model_variant:
raise ValueError('MobileNetv2 support is coming soon.')
arg_scope = arg_scopes_map[model_variant](
is_training=(is_training and fine_tune_batch_norm),
weight_decay=weight_decay)
features, end_points = get_network(
model_variant, preprocess_images, arg_scope)(
inputs=images,
depth_multiplier=depth_multiplier,
output_stride=output_stride,
reuse=reuse,
scope=name_scope[model_variant],
final_endpoint=final_endpoint)
else:
raise ValueError('Unknown model variant %s.' % model_variant)
......
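# A usage sketch for extract_features() with the new MobileNetv2 backbone.
# Illustrative only; argument names follow the signature shown above.
def _example_extract_features():
  """Runs feature extraction with the mobilenet_v2 variant."""
  images = tf.placeholder(tf.float32, shape=[1, 513, 513, 3])
  # The provided MobileNetv2 checkpoints skip the ASPP and decoder modules,
  # so only output_stride and depth_multiplier need to be chosen here.
  features, end_points = extract_features(
      images,
      output_stride=16,
      depth_multiplier=1.0,
      model_variant='mobilenet_v2')
  return features, end_points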
......@@ -14,19 +14,21 @@
# limitations under the License.
# ==============================================================================
#
# Script to preprocess the Cityscapes dataset. Note (1) the users should register
# the Cityscapes dataset website: https://www.cityscapes-dataset.com/downloads/ to
# download the dataset, and (2) the users should run the script provided by Cityscapes
# `preparation/createTrainIdLabelImgs.py` to generate the training groundtruth.
# Script to preprocess the Cityscapes dataset. Note that (1) users should
# register at the Cityscapes dataset website,
# https://www.cityscapes-dataset.com/downloads/, to download the dataset,
# and (2) users should download the utility scripts provided by
# Cityscapes at https://github.com/mcordts/cityscapesScripts.
#
# Usage:
# bash ./preprocess_cityscapes.sh
#
# The folder structure is assumed to be:
# + data
# + datasets
# - build_cityscapes_data.py
# - convert_cityscapes.sh
# + cityscapes
# + cityscapesscripts
# + cityscapesscripts (downloaded scripts)
# + gtFine
# + leftImg8bit
#
......@@ -37,17 +39,18 @@ set -e
CURRENT_DIR=$(pwd)
WORK_DIR="."
cd "${CURRENT_DIR}"
# Root path for PASCAL VOC 2012 dataset.
# Root path for Cityscapes dataset.
CITYSCAPES_ROOT="${WORK_DIR}/cityscapes"
# Create training labels.
python "${CITYSCAPES_ROOT}/cityscapesscripts/preparation/createTrainIdLabelImgs.py"
# Build TFRecords of the dataset.
# First, create output directory for storing TFRecords.
OUTPUT_DIR="${CITYSCAPES_ROOT}/tfrecord"
mkdir -p "${OUTPUT_DIR}"
BUILD_SCRIPT="${WORK_DIR}/build_cityscapes_data.py"
BUILD_SCRIPT="${CURRENT_DIR}/build_cityscapes_data.py"
echo "Converting Cityscapes dataset..."
python "${BUILD_SCRIPT}" \
......
......@@ -20,15 +20,16 @@
# bash ./download_and_preprocess_voc2012.sh
#
# The folder structure is assumed to be:
# + data
# + datasets
# - build_data.py
# - build_voc2012_data.py
# - download_and_preprocess_voc2012.sh
# - remove_gt_colormap.py
# + VOCdevkit
# + VOC2012
# + JPEGImages
# + SegmentationClass
# + pascal_voc_seg
# + VOCdevkit
# + VOC2012
# + JPEGImages
# + SegmentationClass
#
# Exit immediately if a command exits with a non-zero status.
......
This diff is collapsed.
......@@ -50,8 +50,8 @@ flags.DEFINE_integer('eval_interval_secs', 60 * 5,
'How often (in seconds) to run evaluation.')
# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
# rates = [6, 12, 18] if output_stride = 16. Note one could use different
# atrous_rates/output_stride during training/evaluation.
# rates = [6, 12, 18] if output_stride = 16. For `mobilenet_v2`, use None. Note
# one could use different atrous_rates/output_stride during training/evaluation.
flags.DEFINE_multi_integer('atrous_rates', None,
'Atrous rates for atrous spatial pyramid pooling.')
......
......@@ -54,7 +54,7 @@ python deeplab/train.py \
--train_crop_size=769 \
--train_batch_size=1 \
--dataset="cityscapes" \
--tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
--tf_initial_checkpoint=${PATH_TO_INITIAL_CHECKPOINT} \
--train_logdir=${PATH_TO_TRAIN_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
......
......@@ -18,12 +18,49 @@ A: We have not tried this. The interested users could take a look at Philipp Kr
___
Q5: What if I want to train the model and fine-tune the batch normalization parameters?
A: Fine-tuning batch normalization requires large batch size, and thus in the train.py we suggest setting `num_clones` (number of GPUs on one machine) and `train_batch_size` to be as large as possible.
A: Given the limited resources at hand, we would suggest you simply fine-tune
from our provided checkpoint, whose batch-norm parameters have already been
trained (i.e., train with a smaller learning rate, set
`fine_tune_batch_norm = false`, and employ longer training iterations since the
learning rate is small). If you really would like to train the model yourself,
we would suggest the following:
1. Set `output_stride = 16` or maybe even `32` (remember to change the flag
`atrous_rates` accordingly, e.g., `atrous_rates = [3, 6, 9]` for
`output_stride = 32`; see the sketch after this list).
2. Use as many GPUs as possible (change the flag `num_clones` in train.py) and
set `train_batch_size` as large as possible.
3. Adjust the `train_crop_size` in train.py. Maybe set it to be smaller, e.g.,
513x513 (or even 321x321), so that you could use a larger batch size.
4. Use a smaller network backbone, such as MobileNet-v2.
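For step 1, the rule of thumb is that the atrous rates shrink by the same
factor that `output_stride` grows, so the effective field of view stays fixed.
A minimal sketch (illustrative only, not code from this repository):
```python
# Scale the output_stride = 8 rates [12, 24, 36] to other output strides.
def scaled_atrous_rates(output_stride, base_rates=(12, 24, 36),
                        base_output_stride=8):
  factor = output_stride // base_output_stride
  return [rate // factor for rate in base_rates]

print(scaled_atrous_rates(16))  # [6, 12, 18]
print(scaled_atrous_rates(32))  # [3, 6, 9]
```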
___
Q6: How can I train the model asynchronously?
A: In train.py, users could set `num_replicas` (number of machines for training) and `num_ps_tasks` (we usually set `num_ps_tasks` = `num_replicas` / 2). See slim.deployment.model_deploy for more details.
___
Q7: I could not reproduce the performance even with the provided checkpoints.
A: Please try running
```bash
# Run the simple test with Xception_65 as network backbone.
sh local_test.sh
```
or
```bash
# Run the simple test with MobileNet-v2 as network backbone.
sh local_test_mobilenetv2.sh
```
First, make sure you can reproduce the results with our provided settings.
After that, start making changes one at a time to help debug.
___
## References
1. **Deep Residual Learning for Image Recognition**<br />
......
......@@ -10,7 +10,8 @@ Un-tar'ed directory includes:
* a frozen inference graph (`frozen_inference_graph.pb`). All frozen inference
graphs use output stride of 8 and a single eval scale of 1.0. No left-right
flips are used.
flips are used, and MobileNet-v2 based models do not include the decoder
module.
* a checkpoint (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`)
......@@ -21,10 +22,13 @@ set or train_aug + trainval set. In the former case, one could train their model
with smaller batch size and freeze batch normalization when limited GPU memory
is available, since we have already fine-tuned the batch normalization for you.
In the latter case, one could directly evaluate the checkpoints on VOC 2012 test
set or use this checkpoint for demo.
set or use this checkpoint for demo. Note that *MobileNet-v2* based models do
not employ ASPP or decoder modules, for fast computation.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
--------------------------- | :--------------: | :-----------------: | :---: | :-----:
mobilenetv2_coco_voc_trainaug | MobileNet-v2 | MS-COCO <br> VOC 2012 train_aug set| N/A | N/A
mobilenetv2_coco_voc_trainval | MobileNet-v2 | MS-COCO <br> VOC 2012 train_aug + trainval sets | N/A | N/A
xception_coco_voc_trainaug | Xception_65 | MS-COCO <br> VOC 2012 train_aug set| [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
xception_coco_voc_trainval | Xception_65 | MS-COCO <br> VOC 2012 train_aug + trainval sets | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
......@@ -32,6 +36,8 @@ In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | PASCAL mIOU | File Size
------------------------------------------------------------------------------------------------------------------------ | :-------: | :------------------------: | :-------------: | :------------------: | :------------: | :----------------------------: | :-------:
[mobilenetv2_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 2.75B <br> 152.59B | 0.1 <br> 26.9 | 75.32% (val) <br> 77.33% (val) | 23MB
[mobilenetv2_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_trainval_2018_01_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 152.59B | 26.9 | 80.25% (**test**) | 23MB
[xception_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_pascal_train_aug_2018_01_04.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 54.17B <br> 3055.35B | 0.7 <br> 223.2 | 82.20% (val) <br> 83.58% (val) | 439MB
[xception_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_pascal_trainval_2018_01_04.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 3055.35B | 223.2 | 87.80% (**test**) | 439MB
......@@ -48,16 +54,19 @@ for real-time applications.
### Model details
We provide several checkpoints that have been pretrained on Cityscapes
train_fine set.
train_fine set. Note that the *MobileNet-v2* based model has been pretrained
on the MS-COCO dataset and does not employ ASPP or decoder modules, for fast
computation.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
mobilenetv2_coco_cityscapes_trainfine | MobileNet-v2 | MS-COCO <br> Cityscapes train_fine set | N/A | N/A
xception_cityscapes_trainfine | Xception_65 | ImageNet <br> Cityscapes train_fine set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | Cityscapes mIOU | File Size
-------------------------------------------------------------------------------------------------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :------------: | :----------------------------: | :-------:
[mobilenetv2_coco_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_mnv2_cityscapes_train_2018_02_05.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 21.27B <br> 433.24B | 0.8 <br> 51.12 | 70.71% (val) <br> 73.57% (val) | 23MB
[xception_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
## Checkpoints pretrained on ImageNet
......@@ -71,6 +80,10 @@ Un-tar'ed directory includes:
We also provide some checkpoints that are only pretrained on ImageNet, so
that one could use them for training one's own models.
* mobilenet_v2: We refer interested users to the TensorFlow open-source
[MobileNet-V2](https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet)
implementation for details.
* xception: We adapt the original Xception model to the task of semantic
segmentation with the following changes: (1) more layers, (2) all max
pooling operations are replaced by strided (atrous) separable convolutions,
......
......@@ -56,7 +56,7 @@ python deeplab/train.py \
--train_crop_size=513 \
--train_batch_size=1 \
--dataset="pascal_voc_seg" \
--tf_initial_checkpoints=${PATH_TO_INITIAL_CHECKPOINT} \
--tf_initial_checkpoint=${PATH_TO_INITIAL_CHECKPOINT} \
--train_logdir=${PATH_TO_TRAIN_DIR} \
--dataset_dir=${PATH_TO_DATASET}
```
......
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# This script is used to run local test on PASCAL VOC 2012 using MobileNet-v2.
# Users could also modify this script for their own use case.
#
# Usage:
# # From the tensorflow/models/research/deeplab directory.
# sh ./local_test_mobilenetv2.sh
#
#
# Exit immediately if a command exits with a non-zero status.
set -e
# Move one-level up to tensorflow/models/research directory.
cd ..
# Update PYTHONPATH.
export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim
# Set up the working environment.
CURRENT_DIR=$(pwd)
WORK_DIR="${CURRENT_DIR}/deeplab"
# Run model_test first to make sure the PYTHONPATH is correctly set.
python "${WORK_DIR}"/model_test.py -v
# Go to datasets folder and download PASCAL VOC 2012 segmentation dataset.
DATASET_DIR="datasets"
cd "${WORK_DIR}/${DATASET_DIR}"
sh download_and_convert_voc2012.sh
# Go back to original directory.
cd "${CURRENT_DIR}"
# Set up the working directories.
PASCAL_FOLDER="pascal_voc_seg"
EXP_FOLDER="exp/train_on_trainval_set_mobilenetv2"
INIT_FOLDER="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/init_models"
TRAIN_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/train"
EVAL_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/eval"
VIS_LOGDIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/vis"
EXPORT_DIR="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/${EXP_FOLDER}/export"
mkdir -p "${INIT_FOLDER}"
mkdir -p "${TRAIN_LOGDIR}"
mkdir -p "${EVAL_LOGDIR}"
mkdir -p "${VIS_LOGDIR}"
mkdir -p "${EXPORT_DIR}"
# Copy locally the trained checkpoint as the initial checkpoint.
TF_INIT_ROOT="http://download.tensorflow.org/models"
CKPT_NAME="deeplabv3_mnv2_pascal_train_aug"
TF_INIT_CKPT="${CKPT_NAME}_2018_01_29.tar.gz"
cd "${INIT_FOLDER}"
wget -nd -c "${TF_INIT_ROOT}/${TF_INIT_CKPT}"
tar -xf "${TF_INIT_CKPT}"
cd "${CURRENT_DIR}"
PASCAL_DATASET="${WORK_DIR}/${DATASET_DIR}/${PASCAL_FOLDER}/tfrecord"
# Train 10 iterations.
NUM_ITERATIONS=10
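# Note that --train_crop_size (and similarly --eval_crop_size, --vis_crop_size
# and --crop_size below) is passed twice on purpose: the flag is multi-valued
# and takes the crop height and the crop width in turn.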
python "${WORK_DIR}"/train.py \
--logtostderr \
--train_split="trainval" \
--model_variant="mobilenet_v2" \
--output_stride=16 \
--train_crop_size=513 \
--train_crop_size=513 \
--train_batch_size=4 \
--training_number_of_steps="${NUM_ITERATIONS}" \
--fine_tune_batch_norm=true \
--tf_initial_checkpoint="${INIT_FOLDER}/${CKPT_NAME}/model.ckpt-30000" \
--train_logdir="${TRAIN_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}"
# Run evaluation. This performs eval over the full val split (1449 images) and
# will take a while.
# Using the provided checkpoint, one should expect mIOU=75.34%.
python "${WORK_DIR}"/eval.py \
--logtostderr \
--eval_split="val" \
--model_variant="mobilenet_v2" \
--eval_crop_size=513 \
--eval_crop_size=513 \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--eval_logdir="${EVAL_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
--max_number_of_evaluations=1
# Visualize the results.
python "${WORK_DIR}"/vis.py \
--logtostderr \
--vis_split="val" \
--model_variant="mobilenet_v2" \
--vis_crop_size=513 \
--vis_crop_size=513 \
--checkpoint_dir="${TRAIN_LOGDIR}" \
--vis_logdir="${VIS_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}" \
--max_number_of_iterations=1
# Export the trained checkpoint.
CKPT_PATH="${TRAIN_LOGDIR}/model.ckpt-${NUM_ITERATIONS}"
EXPORT_PATH="${EXPORT_DIR}/frozen_inference_graph.pb"
python "${WORK_DIR}"/export_model.py \
--logtostderr \
--checkpoint_path="${CKPT_PATH}" \
--export_path="${EXPORT_PATH}" \
--model_variant="mobilenet_v2" \
--num_classes=21 \
--crop_size=513 \
--crop_size=513 \
--inference_scales=1.0
# Run inference with the exported checkpoint.
# Please refer to the provided deeplab_demo.ipynb for an example.
......@@ -226,8 +226,7 @@ def multi_scale_logits(images,
Raises:
ValueError: If model_options doesn't specify crop_size and its
add_image_level_feature = True, since add_image_level_feature requires
crop_size information. Or, if model_options has model_variant =
'mobilenet_v2' but atrous_rates or decoder_output_stride are not None.
crop_size information.
"""
# Setup default values.
if not image_pyramid:
......@@ -236,6 +235,12 @@ def multi_scale_logits(images,
if model_options.crop_size is None and model_options.add_image_level_feature:
raise ValueError(
'Crop size must be specified for using image-level feature.')
if model_options.model_variant == 'mobilenet_v2':
if (model_options.atrous_rates is not None or
model_options.decoder_output_stride is not None):
# Output a warning; users should make sure the setting is desired.
tf.logging.warning('Our provided mobilenet_v2 checkpoint does not '
'include ASPP and decoder modules.')
crop_height = (
model_options.crop_size[0]
......
......@@ -42,7 +42,7 @@ class DeeplabModelTest(tf.test.TestCase):
image_pyramids = [[1], [0.5, 1]]
# Test two model variants.
model_variants = ['xception_65']
model_variants = ['xception_65', 'mobilenet_v2']
# Test with two output_types.
outputs_to_num_classes = {'semantic': 3,
......@@ -87,16 +87,12 @@ class DeeplabModelTest(tf.test.TestCase):
model_options = common.ModelOptions(
outputs_to_num_classes,
crop_size,
atrous_rates=[6],
output_stride=16
)._replace(
add_image_level_feature=True,
aspp_with_batch_norm=True,
aspp_with_separable_conv=True,
decoder_output_stride=4,
decoder_use_separable_conv=True,
logits_kernel_size=1,
model_variant='xception_65')
model_variant='mobilenet_v2') # Employ MobileNetv2 for fast test.
g = tf.Graph()
with g.as_default():
......
......@@ -139,8 +139,8 @@ flags.DEFINE_float('scale_factor_step_size', 0.25,
'Scale factor step size for data augmentation.')
# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
# rates = [6, 12, 18] if output_stride = 16. Note one could use different
# atrous_rates/output_stride during training/evaluation.
# rates = [6, 12, 18] if output_stride = 16. For `mobilenet_v2`, use None. Note
# one could use different atrous_rates/output_stride during training/evaluation.
flags.DEFINE_multi_integer('atrous_rates', None,
'Atrous rates for atrous spatial pyramid pooling.')
......
......@@ -54,8 +54,8 @@ flags.DEFINE_integer('eval_interval_secs', 60 * 5,
'How often (in seconds) to run evaluation.')
# For `xception_65`, use atrous_rates = [12, 24, 36] if output_stride = 8, or
# rates = [6, 12, 18] if output_stride = 16. Note one could use different
# atrous_rates/output_stride during training/evaluation.
# rates = [6, 12, 18] if output_stride = 16. For `mobilenet_v2`, use None. Note
# one could use different atrous_rates/output_stride during training/evaluation.
flags.DEFINE_multi_integer('atrous_rates', None,
'Atrous rates for atrous spatial pyramid pooling.')
......