ModelZoo / ResNet50_tensorflow / Commits / 8caa269d

Commit 8caa269d, authored Jun 14, 2018 by Liang-Chieh Chen; committed by huihui on Jun 14, 2018.

PiperOrigin-RevId: 200493322
Parent: 1f82c227
Showing 15 changed files with 250 additions and 195 deletions (+250 −195):

research/deeplab/README.md                            +5   −0
research/deeplab/common.py                            +6   −4
research/deeplab/common_test.py (new file)            +34  −0
research/deeplab/core/feature_extractor.py            +4   −10
research/deeplab/core/resnet_v1_beta.py               +32  −102
research/deeplab/core/resnet_v1_beta_test.py          +0   −2
research/deeplab/datasets/segmentation_dataset.py     +1   −0
research/deeplab/deeplab_demo.ipynb                   +2   −2
research/deeplab/g3doc/ade20k.md                      +1   −1
research/deeplab/g3doc/model_zoo.md                   +1   −1
research/deeplab/model.py                             +64  −69
research/deeplab/train.py                             +4   −2
research/deeplab/utils/get_dataset_colormap.py        +84  −0
research/deeplab/utils/get_dataset_colormap_test.py   +5   −0
research/deeplab/utils/train_utils.py                 +7   −2
research/deeplab/README.md (+5 −0)

@@ -113,6 +113,11 @@ with "deeplab".
 ## Change Logs
 
+### May 26, 2018
+
+Updated ADE20K pretrained checkpoint.
+
 ### May 18, 2018
 1. Added builders for ResNet-v1 and Xception model variants.
 1. Added ADE20K support, including colormap and pretrained Xception_65 checkpoint.
research/deeplab/common.py (+6 −4)

@@ -40,10 +40,10 @@ flags.DEFINE_integer('logits_kernel_size', 1,
                      'generates logits.')
 
 # When using 'mobilent_v2', we set atrous_rates = decoder_output_stride = None.
-# When using 'xception_65', we set atrous_rates = [6, 12, 18] (output stride 16)
-# and decoder_output_stride = 4.
-flags.DEFINE_enum('model_variant', 'mobilenet_v2',
-                  ['xception_65', 'mobilenet_v2'], 'DeepLab model variant.')
+# When using 'xception_65' or 'resnet_v1' model variants, we set
+# atrous_rates = [6, 12, 18] (output stride 16) and decoder_output_stride = 4.
+# See core/feature_extractor.py for supported model variants.
+flags.DEFINE_string('model_variant', 'mobilenet_v2', 'DeepLab model variant.')
 
 flags.DEFINE_multi_float('image_pyramid', None,
                          'Input scales for multi-scale feature extraction.')

@@ -57,6 +57,8 @@ flags.DEFINE_boolean('aspp_with_batch_norm', True,
 flags.DEFINE_boolean('aspp_with_separable_conv', True,
                      'Use separable convolution for ASPP or not.')
 
+# Defaults to None. Set multi_grid = [1, 2, 4] when using provided
+# 'resnet_v1_{50,101}_beta' checkpoints.
 flags.DEFINE_multi_integer('multi_grid', None,
                            'Employ a hierarchy of atrous rates for ResNet.')
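A minimal sketch of what the flag change above enables: because model_variant is now a DEFINE_string rather than a two-value DEFINE_enum, ResNet variants listed in core/feature_extractor.py can be passed directly, and multi_grid can be set per the new comment. The snippet below is illustrative only (it assumes the deeplab package is on PYTHONPATH and a TensorFlow 1.x release where tf.app.flags is backed by absl.flags); it is not part of this commit.

# Hedged sketch: exercising the updated flags from common.py outside train.py.
import tensorflow as tf
from deeplab import common  # registers the flags shown in the hunks above

FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
  # 'resnet_v1_50_beta' would have been rejected by the old DEFINE_enum;
  # with DEFINE_string any variant known to core/feature_extractor.py works.
  FLAGS(['example',
         '--model_variant=resnet_v1_50_beta',
         '--multi_grid=1', '--multi_grid=2', '--multi_grid=4'])
  print(FLAGS.model_variant, FLAGS.multi_grid)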
research/deeplab/common_test.py (new file, mode 100644, +34 −0)

# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for common.py."""

import tensorflow as tf

from deeplab import common


class CommonTest(tf.test.TestCase):

  def testOutputsToNumClasses(self):
    num_classes = 21
    model_options = common.ModelOptions(
        outputs_to_num_classes={common.OUTPUT_TYPE: num_classes})
    self.assertEqual(
        model_options.outputs_to_num_classes[common.OUTPUT_TYPE], num_classes)


if __name__ == '__main__':
  tf.test.main()
research/deeplab/core/feature_extractor.py (+4 −10)

@@ -98,8 +98,7 @@ DECODER_END_POINTS = 'decoder_end_points'
 # A dictionary from network name to a map of end point features.
 networks_to_feature_maps = {
     'mobilenet_v2': {
-        # The provided checkpoint does not include decoder module.
-        DECODER_END_POINTS: None,
+        DECODER_END_POINTS: ['layer_4/depthwise_output'],
     },
     'resnet_v1_50': {
         DECODER_END_POINTS: ['block1/unit_2/bottleneck_v1/conv3'],

@@ -211,8 +210,7 @@ def extract_features(images,
                      regularize_depthwise=False,
                      preprocess_images=True,
                      num_classes=None,
-                     global_pool=False,
-                     use_bounded_activations=False):
+                     global_pool=False):
   """Extracts features by the particular model_variant.
 
   Args:

@@ -237,8 +235,6 @@ def extract_features(images,
       to None for dense prediction tasks.
     global_pool: Global pooling for image classification task. Defaults to
       False, since dense prediction tasks do not use this.
-    use_bounded_activations: Whether or not to use bounded activations. Bounded
-      activations better lend themselves to quantized inference.
 
   Returns:
     features: A tensor of size [batch, feature_height, feature_width,

@@ -255,8 +251,7 @@ def extract_features(images,
        weight_decay=weight_decay,
        batch_norm_decay=0.95,
        batch_norm_epsilon=1e-5,
-        batch_norm_scale=True,
-        activation_fn=tf.nn.relu6 if use_bounded_activations else tf.nn.relu)
+        batch_norm_scale=True)
    features, end_points = get_network(
        model_variant, preprocess_images, arg_scope)(
            inputs=images,

@@ -266,8 +261,7 @@ def extract_features(images,
            output_stride=output_stride,
            multi_grid=multi_grid,
            reuse=reuse,
-            scope=name_scope[model_variant],
-            use_bounded_activations=use_bounded_activations)
+            scope=name_scope[model_variant])
  elif 'xception' in model_variant:
    arg_scope = arg_scopes_map[model_variant](
        weight_decay=weight_decay,
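For orientation, a minimal sketch of calling extract_features with the post-change signature (no use_bounded_activations argument). Keyword names follow the hunks above; the input size and other defaults are assumptions, and this snippet is not part of the commit.

# Hedged sketch: building a feature extractor graph after this change.
import tensorflow as tf
from deeplab.core import feature_extractor

images = tf.placeholder(tf.float32, [1, 513, 513, 3])  # arbitrary crop size
features, end_points = feature_extractor.extract_features(
    images,
    output_stride=16,
    multi_grid=[1, 2, 4],               # only meaningful for ResNet variants
    model_variant='resnet_v1_50_beta',  # assumed available per feature_extractor.py
    weight_decay=0.0001)
print(features.get_shape())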
research/deeplab/core/resnet_v1_beta.py (+32 −102)

@@ -44,8 +44,7 @@ def bottleneck(inputs,
                unit_rate=1,
                rate=1,
                outputs_collections=None,
-               scope=None,
-               use_bounded_activations=True):
+               scope=None):
   """Bottleneck residual unit variant with BN after convolutions.
 
   This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for

@@ -65,8 +64,6 @@ def bottleneck(inputs,
     rate: An integer, rate for atrous convolution.
     outputs_collections: Collection to add the ResNet unit output.
     scope: Optional variable_scope.
-    use_bounded_activations: Whether or not to use bounded activations. Bounded
-      activations better lend themselves to quantized inference.
 
   Returns:
     The ResNet unit's output.

@@ -81,7 +78,7 @@ def bottleneck(inputs,
                              depth,
                              [1, 1],
                              stride=stride,
-                             activation_fn=tf.nn.relu6 if use_bounded_activations else None,
+                             activation_fn=None,
                              scope='shortcut')
     residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,

@@ -90,13 +87,7 @@ def bottleneck(inputs,
                            rate=rate*unit_rate, scope='conv2')
     residual = slim.conv2d(residual, depth, [1, 1], stride=1,
                            activation_fn=None, scope='conv3')
 
-    if use_bounded_activations:
-      # Use clip_by_value to simulate bandpass activation.
-      residual = tf.clip_by_value(residual, -6.0, 6.0)
-      output = tf.nn.relu6(shortcut + residual)
-    else:
-      output = tf.nn.relu(shortcut + residual)
+    output = tf.nn.relu(shortcut + residual)
 
     return slim.utils.collect_named_outputs(outputs_collections,
                                             sc.name,

@@ -129,8 +120,6 @@ def resnet_v1_beta(inputs,
                    global_pool=True,
                    output_stride=None,
                    root_block_fn=None,
-                   store_non_strided_activations=False,
-                   use_bounded_activations=False,
                    reuse=None,
                    scope=None):
   """Generator for v1 ResNet models (beta variant).

@@ -159,14 +148,6 @@ def resnet_v1_beta(inputs,
     root_block_fn: The function consisting of convolution operations applied to
       the root input. If root_block_fn is None, use the original setting of
       RseNet-v1, which is simply one convolution with 7x7 kernel and stride=2.
-    store_non_strided_activations: If True, we compute non-strided (undecimated)
-      activations at the last unit of each block and store them in the
-      `outputs_collections` before subsampling them. This gives us access to
-      higher resolution intermediate activations which are useful in some
-      dense prediction problems but increases 4x the computation and memory cost
-      at the last unit of each block.
-    use_bounded_activations: Whether or not to use bounded activations. Bounded
-      activations better lend themselves to quantized inference.
     reuse: whether or not the network and its variables should be reused. To be
       able to reuse 'scope' must be given.
     scope: Optional variable_scope.

@@ -196,35 +177,32 @@ def resnet_v1_beta(inputs,
     with slim.arg_scope([slim.conv2d, bottleneck,
                          resnet_utils.stack_blocks_dense],
                         outputs_collections=end_points_collection):
-      with slim.arg_scope(
-          [bottleneck], use_bounded_activations=use_bounded_activations):
-        if is_training is not None:
-          arg_scope = slim.arg_scope([slim.batch_norm], is_training=is_training)
-        else:
-          arg_scope = slim.arg_scope([])
-        with arg_scope:
-          net = inputs
-          if output_stride is not None:
-            if output_stride % 4 != 0:
-              raise ValueError('The output_stride needs to be a multiple of 4.')
-            output_stride /= 4
-          net = root_block_fn(net)
-          net = slim.max_pool2d(net, 3, stride=2, padding='SAME', scope='pool1')
-          net = resnet_utils.stack_blocks_dense(net, blocks, output_stride,
-                                                store_non_strided_activations)
-          if global_pool:
-            # Global average pooling.
-            net = tf.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
-          if num_classes is not None:
-            net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
-                              normalizer_fn=None, scope='logits')
-          # Convert end_points_collection into a dictionary of end_points.
-          end_points = slim.utils.convert_collection_to_dict(
-              end_points_collection)
-          if num_classes is not None:
-            end_points['predictions'] = slim.softmax(net, scope='predictions')
-          return net, end_points
+      if is_training is not None:
+        arg_scope = slim.arg_scope([slim.batch_norm], is_training=is_training)
+      else:
+        arg_scope = slim.arg_scope([])
+      with arg_scope:
+        net = inputs
+        if output_stride is not None:
+          if output_stride % 4 != 0:
+            raise ValueError('The output_stride needs to be a multiple of 4.')
+          output_stride /= 4
+        net = root_block_fn(net)
+        net = slim.max_pool2d(net, 3, stride=2, padding='SAME', scope='pool1')
+        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
+        if global_pool:
+          # Global average pooling.
+          net = tf.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
+        if num_classes is not None:
+          net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
+                            normalizer_fn=None, scope='logits')
+        # Convert end_points_collection into a dictionary of end_points.
+        end_points = slim.utils.convert_collection_to_dict(
+            end_points_collection)
+        if num_classes is not None:
+          end_points['predictions'] = slim.softmax(net, scope='predictions')
+        return net, end_points
 
 
 def resnet_v1_beta_block(scope, base_depth, num_units, stride):

@@ -258,9 +236,7 @@ def resnet_v1_50(inputs,
                  is_training=None,
                  global_pool=False,
                  output_stride=None,
-                 store_non_strided_activations=False,
                  multi_grid=None,
-                 use_bounded_activations=False,
                  reuse=None,
                  scope='resnet_v1_50'):
   """Resnet v1 50.

@@ -275,15 +251,7 @@ def resnet_v1_50(inputs,
     output_stride: If None, then the output will be computed at the nominal
       network stride. If output_stride is not None, it specifies the requested
       ratio of input to output spatial resolution.
-    store_non_strided_activations: If True, we compute non-strided (undecimated)
-      activations at the last unit of each block and store them in the
-      `outputs_collections` before subsampling them. This gives us access to
-      higher resolution intermediate activations which are useful in some
-      dense prediction problems but increases 4x the computation and memory cost
-      at the last unit of each block.
     multi_grid: Employ a hierarchy of different atrous rates within network.
-    use_bounded_activations: Whether or not to use bounded activations. Bounded
-      activations better lend themselves to quantized inference.
     reuse: whether or not the network and its variables should be reused. To be
       able to reuse 'scope' must be given.
     scope: Optional variable_scope.

@@ -328,10 +296,8 @@ def resnet_v1_50(inputs,
       is_training=is_training,
       global_pool=global_pool,
       output_stride=output_stride,
-      store_non_strided_activations=store_non_strided_activations,
       reuse=reuse,
-      scope=scope,
-      use_bounded_activations=use_bounded_activations)
+      scope=scope)
 
 
 def resnet_v1_50_beta(inputs,

@@ -339,9 +305,7 @@ def resnet_v1_50_beta(inputs,
                       is_training=None,
                       global_pool=False,
                       output_stride=None,
-                      store_non_strided_activations=False,
                       multi_grid=None,
-                      use_bounded_activations=False,
                       reuse=None,
                       scope='resnet_v1_50'):
   """Resnet v1 50 beta variant.

@@ -360,15 +324,7 @@ def resnet_v1_50_beta(inputs,
     output_stride: If None, then the output will be computed at the nominal
       network stride. If output_stride is not None, it specifies the requested
       ratio of input to output spatial resolution.
-    store_non_strided_activations: If True, we compute non-strided (undecimated)
-      activations at the last unit of each block and store them in the
-      `outputs_collections` before subsampling them. This gives us access to
-      higher resolution intermediate activations which are useful in some
-      dense prediction problems but increases 4x the computation and memory cost
-      at the last unit of each block.
     multi_grid: Employ a hierarchy of different atrous rates within network.
-    use_bounded_activations: Whether or not to use bounded activations. Bounded
-      activations better lend themselves to quantized inference.
     reuse: whether or not the network and its variables should be reused. To be
       able to reuse 'scope' must be given.
     scope: Optional variable_scope.

@@ -414,10 +370,8 @@ def resnet_v1_50_beta(inputs,
       global_pool=global_pool,
       output_stride=output_stride,
       root_block_fn=functools.partial(root_block_fn_for_beta_variant),
-      store_non_strided_activations=store_non_strided_activations,
       reuse=reuse,
-      scope=scope,
-      use_bounded_activations=use_bounded_activations)
+      scope=scope)
 
 
 def resnet_v1_101(inputs,

@@ -425,9 +379,7 @@ def resnet_v1_101(inputs,
                   is_training=None,
                   global_pool=False,
                   output_stride=None,
-                  store_non_strided_activations=False,
                   multi_grid=None,
-                  use_bounded_activations=False,
                   reuse=None,
                   scope='resnet_v1_101'):
   """Resnet v1 101.

@@ -442,15 +394,7 @@ def resnet_v1_101(inputs,
     output_stride: If None, then the output will be computed at the nominal
       network stride. If output_stride is not None, it specifies the requested
       ratio of input to output spatial resolution.
-    store_non_strided_activations: If True, we compute non-strided (undecimated)
-      activations at the last unit of each block and store them in the
-      `outputs_collections` before subsampling them. This gives us access to
-      higher resolution intermediate activations which are useful in some
-      dense prediction problems but increases 4x the computation and memory cost
-      at the last unit of each block.
     multi_grid: Employ a hierarchy of different atrous rates within network.
-    use_bounded_activations: Whether or not to use bounded activations. Bounded
-      activations better lend themselves to quantized inference.
     reuse: whether or not the network and its variables should be reused. To be
       able to reuse 'scope' must be given.
     scope: Optional variable_scope.

@@ -495,10 +439,8 @@ def resnet_v1_101(inputs,
       is_training=is_training,
       global_pool=global_pool,
       output_stride=output_stride,
-      store_non_strided_activations=store_non_strided_activations,
       reuse=reuse,
-      scope=scope,
-      use_bounded_activations=use_bounded_activations)
+      scope=scope)
 
 
 def resnet_v1_101_beta(inputs,

@@ -506,9 +448,7 @@ def resnet_v1_101_beta(inputs,
                        is_training=None,
                        global_pool=False,
                        output_stride=None,
-                       store_non_strided_activations=False,
                        multi_grid=None,
-                       use_bounded_activations=False,
                        reuse=None,
                        scope='resnet_v1_101'):
   """Resnet v1 101 beta variant.

@@ -527,15 +467,7 @@ def resnet_v1_101_beta(inputs,
     output_stride: If None, then the output will be computed at the nominal
       network stride. If output_stride is not None, it specifies the requested
       ratio of input to output spatial resolution.
-    store_non_strided_activations: If True, we compute non-strided (undecimated)
-      activations at the last unit of each block and store them in the
-      `outputs_collections` before subsampling them. This gives us access to
-      higher resolution intermediate activations which are useful in some
-      dense prediction problems but increases 4x the computation and memory cost
-      at the last unit of each block.
     multi_grid: Employ a hierarchy of different atrous rates within network.
-    use_bounded_activations: Whether or not to use bounded activations. Bounded
-      activations better lend themselves to quantized inference.
     reuse: whether or not the network and its variables should be reused. To be
       able to reuse 'scope' must be given.
     scope: Optional variable_scope.

@@ -581,7 +513,5 @@ def resnet_v1_101_beta(inputs,
       global_pool=global_pool,
       output_stride=output_stride,
       root_block_fn=functools.partial(root_block_fn_for_beta_variant),
-      store_non_strided_activations=store_non_strided_activations,
-      use_bounded_activations=use_bounded_activations,
       reuse=reuse,
       scope=scope)
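The bounded-activation branch removed above is worth keeping in mind when targeting quantized inference. Below is a standalone sketch that re-expresses the deleted logic so its effect is easy to see; it is illustrative only and, after this commit, not part of the DeepLab API.

# Hedged sketch: what the removed use_bounded_activations path computed.
import tensorflow as tf

def add_residual(shortcut, residual, use_bounded_activations=False):
  """Combines a shortcut and a residual branch, optionally with bounded range."""
  if use_bounded_activations:
    # Clip the residual to [-6, 6] and apply relu6 so intermediate values stay
    # in a fixed range, which is friendlier to 8-bit quantized inference.
    residual = tf.clip_by_value(residual, -6.0, 6.0)
    return tf.nn.relu6(shortcut + residual)
  return tf.nn.relu(shortcut + residual)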
research/deeplab/core/resnet_v1_beta_test.py (+0 −2)

@@ -53,7 +53,6 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
                     is_training=True,
                     global_pool=True,
                     output_stride=None,
-                    store_non_strided_activations=False,
                     multi_grid=None,
                     reuse=None,
                     scope='resnet_v1_small'):

@@ -84,7 +83,6 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
         output_stride=output_stride,
         root_block_fn=functools.partial(
             resnet_v1_beta.root_block_fn_for_beta_variant),
-        store_non_strided_activations=store_non_strided_activations,
         reuse=reuse,
         scope=scope)
research/deeplab/datasets/segmentation_dataset.py (+1 −0)

@@ -89,6 +89,7 @@ _CITYSCAPES_INFORMATION = DatasetDescriptor(
 _PASCAL_VOC_SEG_INFORMATION = DatasetDescriptor(
     splits_to_sizes={
         'train': 1464,
+        'train_aug': 10582,
         'trainval': 2913,
         'val': 1449,
     },
research/deeplab/deeplab_demo.ipynb (+2 −2)

@@ -294,13 +294,13 @@
     "  try:\n",
     "    f = urllib.request.urlopen(url)\n",
     "    jpeg_str = f.read()\n",
-    "    original_im = Image.open(BytesIO(jpeg_str))\n",
+    "    orignal_im = Image.open(BytesIO(jpeg_str))\n",
     "  except IOError:\n",
     "    print('Cannot retrieve image. Please check url: ' + url)\n",
     "    return\n",
     "\n",
     "  print('running deeplab on image %s...' % url)\n",
-    "  resized_im, seg_map = MODEL.run(original_im)\n",
+    "  resized_im, seg_map = MODEL.run(orignal_im)\n",
     "\n",
     "  vis_segmentation(resized_im, seg_map)\n",
     "\n",
research/deeplab/g3doc/ade20k.md (+1 −1)

@@ -49,7 +49,7 @@ A local training job using `xception_65` can be run with the following command:
 # From tensorflow/models/research/
 python deeplab/train.py \
     --logtostderr \
-    --training_number_of_steps=90000 \
+    --training_number_of_steps=150000 \
     --train_split="train" \
     --model_variant="xception_65" \
     --atrous_rates=6 \
research/deeplab/g3doc/model_zoo.md (+1 −1)

@@ -84,7 +84,7 @@ xception_ade20k_train | Xception_65 | ImageNet <br> ADE20K
 Checkpoint name | Eval OS | Eval scales | Left-right Flip | mIOU | Pixel-wise Accuracy | File Size
 ------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :-------------------: | :-------:
-[xception_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_14.tar.gz) | 16 | [0.5:0.25:1.75] | Yes | 43.54% (val) | 81.74% (val) | 439MB
+[xception_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 45.65% (val) | 82.52% (val) | 439MB
 
 ## Checkpoints pretrained on ImageNet
research/deeplab/model.py (+64 −69)

@@ -56,16 +56,12 @@ from deeplab.core import feature_extractor
 slim = tf.contrib.slim
 
-_LOGITS_SCOPE_NAME = 'logits'
-_MERGED_LOGITS_SCOPE = 'merged_logits'
-_IMAGE_POOLING_SCOPE = 'image_pooling'
-_ASPP_SCOPE = 'aspp'
-_CONCAT_PROJECTION_SCOPE = 'concat_projection'
-_DECODER_SCOPE = 'decoder'
-
-
-def get_merged_logits_scope():
-  return _MERGED_LOGITS_SCOPE
+LOGITS_SCOPE_NAME = 'logits'
+MERGED_LOGITS_SCOPE = 'merged_logits'
+IMAGE_POOLING_SCOPE = 'image_pooling'
+ASPP_SCOPE = 'aspp'
+CONCAT_PROJECTION_SCOPE = 'concat_projection'
+DECODER_SCOPE = 'decoder'
 
 
 def get_extra_layer_scopes(last_layers_contain_logits_only=False):

@@ -79,14 +75,14 @@ def get_extra_layer_scopes(last_layers_contain_logits_only=False):
     A list of scopes for extra layers.
   """
   if last_layers_contain_logits_only:
-    return [_LOGITS_SCOPE_NAME]
+    return [LOGITS_SCOPE_NAME]
   else:
     return [
-        _LOGITS_SCOPE_NAME,
-        _IMAGE_POOLING_SCOPE,
-        _ASPP_SCOPE,
-        _CONCAT_PROJECTION_SCOPE,
-        _DECODER_SCOPE,
+        LOGITS_SCOPE_NAME,
+        IMAGE_POOLING_SCOPE,
+        ASPP_SCOPE,
+        CONCAT_PROJECTION_SCOPE,
+        DECODER_SCOPE,
     ]

@@ -133,7 +129,7 @@ def predict_labels_multi_scale(images,
   for output in sorted(outputs_to_scales_to_logits):
     scales_to_logits = outputs_to_scales_to_logits[output]
     logits = tf.image.resize_bilinear(
-        scales_to_logits[_MERGED_LOGITS_SCOPE],
+        scales_to_logits[MERGED_LOGITS_SCOPE],
         tf.shape(images)[1:3],
         align_corners=True)
     outputs_to_predictions[output].append(

@@ -143,7 +139,7 @@ def predict_labels_multi_scale(images,
     scales_to_logits_reversed = (
         outputs_to_scales_to_logits_reversed[output])
     logits_reversed = tf.image.resize_bilinear(
-        tf.reverse_v2(scales_to_logits_reversed[_MERGED_LOGITS_SCOPE], [2]),
+        tf.reverse_v2(scales_to_logits_reversed[MERGED_LOGITS_SCOPE], [2]),
         tf.shape(images)[1:3],
         align_corners=True)
     outputs_to_predictions[output].append(

@@ -182,7 +178,7 @@ def predict_labels(images, model_options, image_pyramid=None):
   for output in sorted(outputs_to_scales_to_logits):
     scales_to_logits = outputs_to_scales_to_logits[output]
     logits = tf.image.resize_bilinear(
-        scales_to_logits[_MERGED_LOGITS_SCOPE],
+        scales_to_logits[MERGED_LOGITS_SCOPE],
         tf.shape(images)[1:3],
         align_corners=True)
     predictions[output] = tf.argmax(logits, 3)

@@ -221,7 +217,6 @@ def multi_scale_logits(images,
     images: A tensor of size [batch, height, width, channels].
     model_options: A ModelOptions instance to configure models.
     image_pyramid: Input image scales for multi-scale feature extraction.
-
     weight_decay: The weight decay for model variables.
     is_training: Is training or not.
     fine_tune_batch_norm: Fine-tune the batch norm parameters or not.

@@ -242,17 +237,9 @@ def multi_scale_logits(images,
   # Setup default values.
   if not image_pyramid:
     image_pyramid = [1.0]
-
   if model_options.crop_size is None and model_options.add_image_level_feature:
     raise ValueError(
         'Crop size must be specified for using image-level feature.')
-  if model_options.model_variant == 'mobilenet_v2':
-    if (model_options.atrous_rates is not None or
-        model_options.decoder_output_stride is not None):
-      # Output a warning and users should make sure if the setting is desired.
-      tf.logging.warning('Our provided mobilenet_v2 checkpoint does not '
-                         'include ASPP and decoder modules.')
-
   crop_height = (
       model_options.crop_size[0]
       if model_options.crop_size else tf.shape(images)[1])

@@ -277,7 +264,7 @@ def multi_scale_logits(images,
       for k in model_options.outputs_to_num_classes
   }
 
-  for count, image_scale in enumerate(image_pyramid):
+  for image_scale in image_pyramid:
     if image_scale != 1.0:
       scaled_height = scale_dimension(crop_height, image_scale)
       scaled_width = scale_dimension(crop_width, image_scale)

@@ -295,7 +282,7 @@ def multi_scale_logits(images,
         scaled_images,
         updated_options,
         weight_decay=weight_decay,
-        reuse=True if count else None,
+        reuse=tf.AUTO_REUSE,
         is_training=is_training,
         fine_tune_batch_norm=fine_tune_batch_norm)

@@ -309,7 +296,7 @@ def multi_scale_logits(images,
     if len(image_pyramid) == 1:
       for output in sorted(model_options.outputs_to_num_classes):
         outputs_to_scales_to_logits[output][
-            _MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
+            MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
       return outputs_to_scales_to_logits
 
     # Save logits to the output map.

@@ -328,18 +315,18 @@ def multi_scale_logits(images,
     merge_fn = (
         tf.reduce_max
         if model_options.merge_method == 'max' else tf.reduce_mean)
-    outputs_to_scales_to_logits[output][_MERGED_LOGITS_SCOPE] = merge_fn(
+    outputs_to_scales_to_logits[output][MERGED_LOGITS_SCOPE] = merge_fn(
         all_logits, axis=4)
 
   return outputs_to_scales_to_logits
 
 
-def _extract_features(images,
-                      model_options,
-                      weight_decay=0.0001,
-                      reuse=None,
-                      is_training=False,
-                      fine_tune_batch_norm=False):
+def extract_features(images,
+                     model_options,
+                     weight_decay=0.0001,
+                     reuse=None,
+                     is_training=False,
+                     fine_tune_batch_norm=False):
   """Extracts features by the particular model_variant.
 
   Args:

@@ -399,7 +386,7 @@ def _extract_features(images,
             features, [pool_height, pool_width], [pool_height, pool_width],
             padding='VALID')
         image_feature = slim.conv2d(
-            image_feature, depth, 1, scope=_IMAGE_POOLING_SCOPE)
+            image_feature, depth, 1, scope=IMAGE_POOLING_SCOPE)
         image_feature = tf.image.resize_bilinear(
             image_feature, [pool_height, pool_width], align_corners=True)
         image_feature.set_shape([None, pool_height, pool_width, depth])

@@ -407,14 +394,14 @@ def _extract_features(images,
       # Employ a 1x1 convolution.
       branch_logits.append(slim.conv2d(features, depth, 1,
-                                       scope=_ASPP_SCOPE + str(0)))
+                                       scope=ASPP_SCOPE + str(0)))
 
       if model_options.atrous_rates:
         # Employ 3x3 convolutions with different atrous rates.
         for i, rate in enumerate(model_options.atrous_rates, 1):
-          scope = _ASPP_SCOPE + str(i)
+          scope = ASPP_SCOPE + str(i)
           if model_options.aspp_with_separable_conv:
-            aspp_features = _split_separable_conv2d(
+            aspp_features = split_separable_conv2d(
                 features,
                 filters=depth,
                 rate=rate,

@@ -428,12 +415,12 @@ def _extract_features(images,
       # Merge branch logits.
       concat_logits = tf.concat(branch_logits, 3)
       concat_logits = slim.conv2d(
-          concat_logits, depth, 1, scope=_CONCAT_PROJECTION_SCOPE)
+          concat_logits, depth, 1, scope=CONCAT_PROJECTION_SCOPE)
       concat_logits = slim.dropout(
           concat_logits,
           keep_prob=0.9,
           is_training=is_training,
-          scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
+          scope=CONCAT_PROJECTION_SCOPE + '_dropout')
 
       return concat_logits, end_points

@@ -457,7 +444,7 @@ def _get_logits(images,
   Returns:
     outputs_to_logits: A map from output_type to logits.
   """
-  features, end_points = _extract_features(
+  features, end_points = extract_features(
       images,
       model_options,
       weight_decay=weight_decay,

@@ -484,7 +471,7 @@ def _get_logits(images,
   outputs_to_logits = {}
   for output in sorted(model_options.outputs_to_num_classes):
-    outputs_to_logits[output] = _get_branch_logits(
+    outputs_to_logits[output] = get_branch_logits(
         features,
         model_options.outputs_to_num_classes[output],
         model_options.atrous_rates,

@@ -543,7 +530,7 @@ def refine_by_decoder(features,
       stride=1,
       reuse=reuse):
     with slim.arg_scope([slim.batch_norm], **batch_norm_params):
-      with tf.variable_scope(_DECODER_SCOPE, _DECODER_SCOPE, [features]):
+      with tf.variable_scope(DECODER_SCOPE, DECODER_SCOPE, [features]):
         feature_list = feature_extractor.networks_to_feature_maps[
             model_variant][feature_extractor.DECODER_END_POINTS]
         if feature_list is None:

@@ -553,8 +540,13 @@ def refine_by_decoder(features,
         decoder_features = features
         for i, name in enumerate(feature_list):
           decoder_features_list = [decoder_features]
-          feature_name = '{}/{}'.format(
-              feature_extractor.name_scope[model_variant], name)
+
+          # MobileNet variants use different naming convention.
+          if 'mobilenet' in model_variant:
+            feature_name = name
+          else:
+            feature_name = '{}/{}'.format(
+                feature_extractor.name_scope[model_variant], name)
           decoder_features_list.append(
               slim.conv2d(
                   end_points[feature_name],

@@ -569,13 +561,13 @@ def refine_by_decoder(features,
                 [None, decoder_height, decoder_width, None])
           decoder_depth = 256
           if decoder_use_separable_conv:
-            decoder_features = _split_separable_conv2d(
+            decoder_features = split_separable_conv2d(
                 tf.concat(decoder_features_list, 3),
                 filters=decoder_depth,
                 rate=1,
                 weight_decay=weight_decay,
                 scope='decoder_conv0')
-            decoder_features = _split_separable_conv2d(
+            decoder_features = split_separable_conv2d(
                 decoder_features,
                 filters=decoder_depth,
                 rate=1,

@@ -593,14 +585,14 @@ def refine_by_decoder(features,
   return decoder_features
 
 
-def _get_branch_logits(features,
-                       num_classes,
-                       atrous_rates=None,
-                       aspp_with_batch_norm=False,
-                       kernel_size=1,
-                       weight_decay=0.0001,
-                       reuse=None,
-                       scope_suffix=''):
+def get_branch_logits(features,
+                      num_classes,
+                      atrous_rates=None,
+                      aspp_with_batch_norm=False,
+                      kernel_size=1,
+                      weight_decay=0.0001,
+                      reuse=None,
+                      scope_suffix=''):
   """Gets the logits from each model's branch.
 
   The underlying model is branched out in the last layer when atrous

@@ -624,7 +616,7 @@ def _get_branch_logits(features,
     ValueError: Upon invalid input kernel_size value.
   """
   # When using batch normalization with ASPP, ASPP has been applied before
-  # in _extract_features, and thus we simply apply 1x1 convolution here.
+  # in extract_features, and thus we simply apply 1x1 convolution here.
   if aspp_with_batch_norm or atrous_rates is None:
     if kernel_size != 1:
       raise ValueError('Kernel size must be 1 when atrous_rates is None or '

@@ -636,7 +628,7 @@ def _get_branch_logits(features,
       weights_regularizer=slim.l2_regularizer(weight_decay),
       weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
       reuse=reuse):
-    with tf.variable_scope(_LOGITS_SCOPE_NAME, _LOGITS_SCOPE_NAME, [features]):
+    with tf.variable_scope(LOGITS_SCOPE_NAME, LOGITS_SCOPE_NAME, [features]):
       branch_logits = []
       for i, rate in enumerate(atrous_rates):
         scope = scope_suffix

@@ -656,13 +648,14 @@ def _get_branch_logits(features,
       return tf.add_n(branch_logits)
 
 
-def _split_separable_conv2d(inputs,
-                            filters,
-                            rate=1,
-                            weight_decay=0.00004,
-                            depthwise_weights_initializer_stddev=0.33,
-                            pointwise_weights_initializer_stddev=0.06,
-                            scope=None):
+def split_separable_conv2d(inputs,
+                           filters,
+                           kernel_size=3,
+                           rate=1,
+                           weight_decay=0.00004,
+                           depthwise_weights_initializer_stddev=0.33,
+                           pointwise_weights_initializer_stddev=0.06,
+                           scope=None):
   """Splits a separable conv2d into depthwise and pointwise conv2d.
 
   This operation differs from `tf.layers.separable_conv2d` as this operation

@@ -671,6 +664,8 @@ def _split_separable_conv2d(inputs,
   Args:
     inputs: Input tensor with shape [batch, height, width, channels].
     filters: Number of filters in the 1x1 pointwise convolution.
+    kernel_size: A list of length 2: [kernel_height, kernel_width] of
+      of the filters. Can be an int if both values are the same.
     rate: Atrous convolution rate for the depthwise convolution.
     weight_decay: The weight decay to use for regularizing the model.
     depthwise_weights_initializer_stddev: The standard deviation of the

@@ -685,7 +680,7 @@ def _split_separable_conv2d(inputs,
   outputs = slim.separable_conv2d(
       inputs,
       None,
-      3,
+      kernel_size=kernel_size,
       depth_multiplier=1,
       rate=rate,
       weights_initializer=tf.truncated_normal_initializer(
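A short sketch of how the helpers made public above can now be used directly from model.py: the scope constants no longer need an accessor, and split_separable_conv2d takes the new kernel_size argument. Input sizes and the scope name below are illustrative assumptions, not part of the commit.

# Hedged sketch: using the now-public model.py helpers.
import tensorflow as tf
from deeplab import model

print(model.MERGED_LOGITS_SCOPE)  # 'merged_logits'; replaces get_merged_logits_scope()

inputs = tf.placeholder(tf.float32, [1, 65, 65, 256])
outputs = model.split_separable_conv2d(
    inputs,
    filters=256,
    kernel_size=3,   # the argument added in this commit
    rate=2,
    weight_decay=0.00004,
    scope='example_aspp_branch')
print(outputs.get_shape())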
research/deeplab/train.py (+4 −2)

@@ -101,6 +101,8 @@ flags.DEFINE_float('momentum', 0.9, 'The momentum value to use')
 flags.DEFINE_integer('train_batch_size', 8,
                      'The number of images in each batch during training.')
 
+# For weight_decay, use 0.00004 for MobileNet-V2 or Xcpetion model variants.
+# Use 0.0001 for ResNet model variants.
 flags.DEFINE_float('weight_decay', 0.00004,
                    'The value of the weight decay for training.')

@@ -206,8 +208,8 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
   # Add name to graph node so we can add to summary.
   output_type_dict = outputs_to_scales_to_logits[common.OUTPUT_TYPE]
-  output_type_dict[model.get_merged_logits_scope()] = tf.identity(
-      output_type_dict[model.get_merged_logits_scope()],
+  output_type_dict[model.MERGED_LOGITS_SCOPE] = tf.identity(
+      output_type_dict[model.MERGED_LOGITS_SCOPE],
       name=common.OUTPUT_TYPE)
 
   for output, num_classes in six.iteritems(outputs_to_num_classes):
research/deeplab/utils/get_dataset_colormap.py (+84 −0)

@@ -29,12 +29,14 @@ import numpy as np
 # Dataset names.
 _ADE20K = 'ade20k'
 _CITYSCAPES = 'cityscapes'
+_MAPILLARY_VISTAS = 'mapillary_vistas'
 _PASCAL = 'pascal'
 
 # Max number of entries in the colormap for each dataset.
 _DATASET_MAX_ENTRIES = {
     _ADE20K: 151,
     _CITYSCAPES: 19,
+    _MAPILLARY_VISTAS: 66,
     _PASCAL: 256,
 }

@@ -229,6 +231,82 @@ def create_cityscapes_label_colormap():
   ])
 
 
+def create_mapillary_vistas_label_colormap():
+  """Creates a label colormap used in Mapillary Vistas segmentation benchmark.
+
+  Returns:
+    A colormap for visualizing segmentation results.
+  """
+  return np.asarray([
+      [165, 42, 42],
+      [0, 192, 0],
+      [196, 196, 196],
+      [190, 153, 153],
+      [180, 165, 180],
+      [102, 102, 156],
+      [102, 102, 156],
+      [128, 64, 255],
+      [140, 140, 200],
+      [170, 170, 170],
+      [250, 170, 160],
+      [96, 96, 96],
+      [230, 150, 140],
+      [128, 64, 128],
+      [110, 110, 110],
+      [244, 35, 232],
+      [150, 100, 100],
+      [70, 70, 70],
+      [150, 120, 90],
+      [220, 20, 60],
+      [255, 0, 0],
+      [255, 0, 0],
+      [255, 0, 0],
+      [200, 128, 128],
+      [255, 255, 255],
+      [64, 170, 64],
+      [128, 64, 64],
+      [70, 130, 180],
+      [255, 255, 255],
+      [152, 251, 152],
+      [107, 142, 35],
+      [0, 170, 30],
+      [255, 255, 128],
+      [250, 0, 30],
+      [0, 0, 0],
+      [220, 220, 220],
+      [170, 170, 170],
+      [222, 40, 40],
+      [100, 170, 30],
+      [40, 40, 40],
+      [33, 33, 33],
+      [170, 170, 170],
+      [0, 0, 142],
+      [170, 170, 170],
+      [210, 170, 100],
+      [153, 153, 153],
+      [128, 128, 128],
+      [0, 0, 142],
+      [250, 170, 30],
+      [192, 192, 192],
+      [220, 220, 0],
+      [180, 165, 180],
+      [119, 11, 32],
+      [0, 0, 142],
+      [0, 60, 100],
+      [0, 0, 142],
+      [0, 0, 90],
+      [0, 0, 230],
+      [0, 80, 100],
+      [128, 64, 64],
+      [0, 0, 110],
+      [0, 0, 70],
+      [0, 0, 192],
+      [32, 32, 32],
+      [0, 0, 0],
+      [0, 0, 0],
+      ])
+
+
 def create_pascal_label_colormap():
   """Creates a label colormap used in PASCAL VOC segmentation benchmark.

@@ -254,6 +332,10 @@ def get_cityscapes_name():
   return _CITYSCAPES
 
 
+def get_mapillary_vistas_name():
+  return _MAPILLARY_VISTAS
+
+
 def get_pascal_name():
   return _PASCAL

@@ -287,6 +369,8 @@ def create_label_colormap(dataset=_PASCAL):
     return create_ade20k_label_colormap()
   elif dataset == _CITYSCAPES:
     return create_cityscapes_label_colormap()
+  elif dataset == _MAPILLARY_VISTAS:
+    return create_mapillary_vistas_label_colormap()
   elif dataset == _PASCAL:
     return create_pascal_label_colormap()
   else:
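For reference, a minimal sketch of colorizing a label map with the newly added Mapillary Vistas colormap. label_to_color_image is the existing helper in this module exercised by the tests below (assumed unchanged by this commit); the dummy label sizes are arbitrary.

# Hedged sketch: visualizing a dummy Mapillary Vistas label map.
import numpy as np
from deeplab.utils import get_dataset_colormap

label = np.random.randint(0, 66, size=(10, 10))  # 66 classes, per the new entry
colored = get_dataset_colormap.label_to_color_image(
    label, get_dataset_colormap.get_mapillary_vistas_name())
print(colored.shape)  # (10, 10, 3)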
research/deeplab/utils/get_dataset_colormap_test.py (+5 −0)

@@ -86,6 +86,11 @@ class VisualizationUtilTest(tf.test.TestCase):
         label, get_dataset_colormap.get_ade20k_name())
     self.assertTrue(np.array_equal(colored_label, expected_result))
 
+  def testMapillaryVistasColorMapValue(self):
+    colormap = get_dataset_colormap.create_mapillary_vistas_label_colormap()
+    self.assertTrue(np.array_equal([190, 153, 153], colormap[3, :]))
+    self.assertTrue(np.array_equal([102, 102, 156], colormap[6, :]))
+
 
 if __name__ == '__main__':
   tf.test.main()
research/deeplab/utils/train_utils.py (+7 −2)

@@ -17,6 +17,7 @@
 import six
 import tensorflow as tf
 
+from deeplab.core import preprocess_utils
 slim = tf.contrib.slim

@@ -54,12 +55,16 @@ def add_softmax_cross_entropy_loss_for_each_scale(scales_to_logits,
     if upsample_logits:
       # Label is not downsampled, and instead we upsample logits.
       logits = tf.image.resize_bilinear(
-          logits, tf.shape(labels)[1:3], align_corners=True)
+          logits,
+          preprocess_utils.resolve_shape(labels, 4)[1:3],
+          align_corners=True)
       scaled_labels = labels
     else:
       # Label is downsampled to the same size as logits.
       scaled_labels = tf.image.resize_nearest_neighbor(
-          labels, tf.shape(logits)[1:3], align_corners=True)
+          labels,
+          preprocess_utils.resolve_shape(logits, 4)[1:3],
+          align_corners=True)
 
     scaled_labels = tf.reshape(scaled_labels, shape=[-1])
     not_ignore_mask = tf.to_float(tf.not_equal(scaled_labels,
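The switch above from tf.shape to preprocess_utils.resolve_shape lets the resize keep statically known height/width when they are available, instead of always falling back to a dynamic shape tensor. Below is a sketch of the idea only; it is an illustration under that assumption, not the actual deeplab preprocess_utils implementation.

# Hedged sketch: prefer static dimensions, fall back to dynamic ones.
import tensorflow as tf

def resolve_shape_sketch(tensor, rank):
  """Returns a list of dims, static where available, dynamic otherwise."""
  static = tensor.get_shape().with_rank(rank).as_list()
  dynamic = tf.unstack(tf.shape(tensor), rank)
  return [s if s is not None else d for s, d in zip(static, dynamic)]

labels = tf.placeholder(tf.float32, [None, 513, 513, 1])
logits = tf.placeholder(tf.float32, [None, None, None, 21])
resized = tf.image.resize_bilinear(
    logits, resolve_shape_sketch(labels, 4)[1:3], align_corners=True)
print(resized.get_shape())  # height/width resolve statically to 513 x 513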