Commit 83490227 authored by Hui Hui

Source code for `Searching for Efficient Multi-Scale Architectures for Dense Image Prediction`

parent c961e92d
...@@ -52,6 +52,18 @@ works:
}
```
* Architecture search for dense prediction cell:
```
@inproceedings{dpc2018,
title={Searching for Efficient Multi-Scale Architectures for Dense Image Prediction},
author={Liang-Chieh Chen and Maxwell D. Collins and Yukun Zhu and George Papandreou and Barret Zoph and Florian Schroff and Hartwig Adam and Jonathon Shlens},
booktitle={NIPS},
year={2018}
}
```
In the current implementation, we support adopting the following network
backbones:
...@@ -114,6 +126,10 @@ with "deeplab".
## Change Logs
### September 5, 2018
Released Cityscapes pretrained checkpoints with the best dense prediction cell found by the architecture search.
### May 26, 2018
Updated ADE20K pretrained checkpoint.
......
...@@ -18,6 +18,7 @@ Common flags from train/eval/vis/export_model.py are collected in this script.
"""
import collections
import copy
import json
import tensorflow as tf
...@@ -85,6 +86,11 @@ flags.DEFINE_boolean('decoder_use_separable_conv', True,
flags.DEFINE_enum('merge_method', 'max', ['max', 'avg'],
                  'Scheme to merge multi scale features.')
flags.DEFINE_string(
'dense_prediction_cell_json',
'',
'A JSON file that specifies the dense prediction cell.')
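# Example: the released Cityscapes cell is specified in
# deeplab/core/dense_prediction_cell_branch5_top1_cityscapes.json, a JSON list
# of branch specifications such as
#   {"kernel": 3, "rate": [1, 6], "op": "conv", "input": -1}
# and is selected by passing
#   --dense_prediction_cell_json=deeplab/core/dense_prediction_cell_branch5_top1_cityscapes.json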
FLAGS = flags.FLAGS
# Constants
...@@ -122,6 +128,7 @@ class ModelOptions(
'logits_kernel_size',
'model_variant',
'depth_multiplier',
'dense_prediction_cell_config',
])):
"""Immutable class to hold model options."""
...@@ -145,13 +152,19 @@ class ModelOptions(
Returns:
A new ModelOptions instance.
"""
dense_prediction_cell_config = None
if FLAGS.dense_prediction_cell_json:
with tf.gfile.Open(FLAGS.dense_prediction_cell_json, 'r') as f:
dense_prediction_cell_config = json.load(f)
return super(ModelOptions, cls).__new__(
    cls, outputs_to_num_classes, crop_size, atrous_rates, output_stride,
    FLAGS.merge_method, FLAGS.add_image_level_feature,
    FLAGS.image_pooling_crop_size, FLAGS.aspp_with_batch_norm,
    FLAGS.aspp_with_separable_conv, FLAGS.multi_grid,
    FLAGS.decoder_output_stride, FLAGS.decoder_use_separable_conv,
    FLAGS.logits_kernel_size, FLAGS.model_variant, FLAGS.depth_multiplier,
    dense_prediction_cell_config)
def __deepcopy__(self, memo):
return ModelOptions(copy.deepcopy(self.outputs_to_num_classes),
......
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dense Prediction Cell class that can be evolved in semantic segmentation.
DensePredictionCell is used as a `layer` in semantic segmentation whose
architecture is determined by `config`, a list of dictionaries, each
specifying one operation (branch) of the cell.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from deeplab.core import utils
slim = tf.contrib.slim
# Local constants.
_META_ARCHITECTURE_SCOPE = 'meta_architecture'
_CONCAT_PROJECTION_SCOPE = 'concat_projection'
_OP = 'op'
_CONV = 'conv'
_PYRAMID_POOLING = 'pyramid_pooling'
_KERNEL = 'kernel'
_RATE = 'rate'
_GRID_SIZE = 'grid_size'
_TARGET_SIZE = 'target_size'
_INPUT = 'input'
def dense_prediction_cell_hparams():
"""DensePredictionCell HParams.
Returns:
A dictionary of hyper-parameters used for dense prediction cell with keys:
- reduction_size: Integer, the number of output filters for each operation
inside the cell.
- dropout_on_concat_features: Boolean, apply dropout on the concatenated
features or not.
- dropout_on_projection_features: Boolean, apply dropout on the projection
features or not.
- dropout_keep_prob: Float, when `dropout_on_concat_features` or
`dropout_on_projection_features` is True, the `keep_prob` value used
in the dropout operation.
- concat_channels: Integer, the concatenated features will be
channel-reduced to `concat_channels` channels.
- conv_rate_multiplier: Integer, used to multiply the convolution rates.
This is useful when the output_stride is changed from 16 to 8, in which
case the convolution rates need to be doubled correspondingly.
"""
return {
'reduction_size': 256,
'dropout_on_concat_features': True,
'dropout_on_projection_features': False,
'dropout_keep_prob': 0.9,
'concat_channels': 256,
'conv_rate_multiplier': 1,
}
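# Example usage (a sketch): callers usually override only a subset of these
# defaults. For instance, model.py doubles the convolution rates when the
# output stride is reduced from 16 to 8:
#
#   hparams = dense_prediction_cell_hparams()
#   hparams.update({'conv_rate_multiplier': 16 // output_stride})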
class DensePredictionCell(object):
"""DensePredictionCell class used as a 'layer' in semantic segmentation."""
def __init__(self, config, hparams=None):
"""Initializes the dense prediction cell.
Args:
config: A list of dictionaries, each specifying one operation of the
dense prediction cell.
hparams: A dictionary of hyper-parameters, provided by users. This
dictionary will be used to update the default dictionary returned by
dense_prediction_cell_hparams().
Raises:
ValueError: If `conv_rate_multiplier` has value < 1.
"""
self.hparams = dense_prediction_cell_hparams()
if hparams is not None:
self.hparams.update(hparams)
self.config = config
# Check values in hparams are valid or not.
if self.hparams['conv_rate_multiplier'] < 1:
raise ValueError('conv_rate_multiplier cannot have value < 1.')
def _get_pyramid_pooling_arguments(
self, crop_size, output_stride, image_grid, image_pooling_crop_size=None):
"""Gets arguments for pyramid pooling.
Args:
crop_size: A list of two integers, [crop_height, crop_width] specifying
whole patch crop size.
output_stride: Integer, output stride value for extracted features.
image_grid: A list of two integers, [image_grid_height, image_grid_width],
specifying the grid over which pyramid pooling is performed.
image_pooling_crop_size: A list of two integers, [crop_height, crop_width]
specifying the crop size for image pooling operations. Note that we
decouple whole patch crop_size and image_pooling_crop_size as one could
perform the image_pooling with different crop sizes.
Returns:
A tuple ([resize_height, resize_width], [pooled_height, pooled_width]).
"""
resize_height = utils.scale_dimension(crop_size[0], 1. / output_stride)
resize_width = utils.scale_dimension(crop_size[1], 1. / output_stride)
# If image_pooling_crop_size is not specified, use crop_size.
if image_pooling_crop_size is None:
image_pooling_crop_size = crop_size
pooled_height = utils.scale_dimension(
image_pooling_crop_size[0], 1. / (output_stride * image_grid[0]))
pooled_width = utils.scale_dimension(
image_pooling_crop_size[1], 1. / (output_stride * image_grid[1]))
return ([resize_height, resize_width], [pooled_height, pooled_width])
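  # Worked example (mirroring dense_prediction_cell_test.py): with
  # crop_size=[513, 513], output_stride=16 and image_grid=[4, 4], the resized
  # feature size is scale_dimension(513, 1/16) = 33 per side, and each pooling
  # kernel is scale_dimension(513, 1/(16*4)) = 9 per side, i.e. the 33x33
  # features are average-pooled with 9x9 kernels arranged on a 4x4 grid.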
def _parse_operation(self, config, crop_size, output_stride,
image_pooling_crop_size=None):
"""Parses one operation.
When the operation is `pyramid_pooling`, we compute the required
hyper-parameters and save them in config.
Args:
config: A dictionary storing required hyper-parameters for one
operation.
crop_size: A list of two integers, [crop_height, crop_width] specifying
whole patch crop size.
output_stride: Integer, output stride value for extracted features.
image_pooling_crop_size: A list of two integers, [crop_height, crop_width]
specifying the crop size for image pooling operations. Note that we
decouple whole patch crop_size and image_pooling_crop_size as one could
perform the image_pooling with different crop sizes.
Returns:
A dictionary storing the related information for the operation.
"""
if config[_OP] == _PYRAMID_POOLING:
(config[_TARGET_SIZE],
config[_KERNEL]) = self._get_pyramid_pooling_arguments(
crop_size=crop_size,
output_stride=output_stride,
image_grid=config[_GRID_SIZE],
image_pooling_crop_size=image_pooling_crop_size)
return config
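  # Worked example (mirroring dense_prediction_cell_test.py): parsing
  #   {_OP: _PYRAMID_POOLING, _GRID_SIZE: [2, 2]}
  # with crop_size=[513, 513] and output_stride=16 fills in
  #   _TARGET_SIZE: [33, 33] and _KERNEL: [17, 17],
  # while conv operations pass through unchanged.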
def build_cell(self,
features,
output_stride=16,
crop_size=None,
image_pooling_crop_size=None,
weight_decay=0.00004,
reuse=None,
is_training=False,
fine_tune_batch_norm=False,
scope=None):
"""Builds the dense prediction cell based on the config.
Args:
features: Input feature map of size [batch, height, width, channels].
output_stride: Int, output stride at which the features were extracted.
crop_size: A list [crop_height, crop_width], determining the input
features resolution.
image_pooling_crop_size: A list of two integers, [crop_height, crop_width]
specifying the crop size for image pooling operations. Note that we
decouple whole patch crop_size and image_pooling_crop_size as one could
perform the image_pooling with different crop sizes.
weight_decay: Float, the weight decay for model variables.
reuse: Reuse the model variables or not.
is_training: Boolean, is training or not.
fine_tune_batch_norm: Boolean, fine-tuning batch norm parameters or not.
scope: Optional string, specifying the variable scope.
Returns:
Features after passing through the constructed dense prediction cell with
shape = [batch, height, width, channels] where channels are determined
by `reduction_size` returned by dense_prediction_cell_hparams().
Raises:
ValueError: If the convolution kernel size is neither 1x1 nor 3x3, or
the operation is not recognized.
"""
batch_norm_params = {
'is_training': is_training and fine_tune_batch_norm,
'decay': 0.9997,
'epsilon': 1e-5,
'scale': True,
}
hparams = self.hparams
with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
padding='SAME',
stride=1,
reuse=reuse):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
with tf.variable_scope(scope, _META_ARCHITECTURE_SCOPE, [features]):
depth = hparams['reduction_size']
branch_logits = []
for i, current_config in enumerate(self.config):
scope = 'branch%d' % i
current_config = self._parse_operation(
config=current_config,
crop_size=crop_size,
output_stride=output_stride,
image_pooling_crop_size=image_pooling_crop_size)
tf.logging.info(current_config)
if current_config[_INPUT] < 0:
operation_input = features
else:
operation_input = branch_logits[current_config[_INPUT]]
if current_config[_OP] == _CONV:
if current_config[_KERNEL] == [1, 1] or current_config[
_KERNEL] == 1:
branch_logits.append(
slim.conv2d(operation_input, depth, 1, scope=scope))
else:
conv_rate = [r * hparams['conv_rate_multiplier']
for r in current_config[_RATE]]
branch_logits.append(
utils.split_separable_conv2d(
operation_input,
filters=depth,
kernel_size=current_config[_KERNEL],
rate=conv_rate,
weight_decay=weight_decay,
scope=scope))
elif current_config[_OP] == _PYRAMID_POOLING:
pooled_features = slim.avg_pool2d(
operation_input,
kernel_size=current_config[_KERNEL],
stride=[1, 1],
padding='VALID')
pooled_features = slim.conv2d(
pooled_features,
depth,
1,
scope=scope)
pooled_features = tf.image.resize_bilinear(
pooled_features,
current_config[_TARGET_SIZE],
align_corners=True)
# Set shape for resize_height/resize_width if they are not Tensor.
resize_height = current_config[_TARGET_SIZE][0]
resize_width = current_config[_TARGET_SIZE][1]
if isinstance(resize_height, tf.Tensor):
resize_height = None
if isinstance(resize_width, tf.Tensor):
resize_width = None
pooled_features.set_shape(
[None, resize_height, resize_width, depth])
branch_logits.append(pooled_features)
else:
raise ValueError('Unrecognized operation.')
# Merge branch logits.
concat_logits = tf.concat(branch_logits, 3)
if self.hparams['dropout_on_concat_features']:
concat_logits = slim.dropout(
concat_logits,
keep_prob=self.hparams['dropout_keep_prob'],
is_training=is_training,
scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
concat_logits = slim.conv2d(concat_logits,
self.hparams['concat_channels'],
1,
scope=_CONCAT_PROJECTION_SCOPE)
if self.hparams['dropout_on_projection_features']:
concat_logits = slim.dropout(
concat_logits,
keep_prob=self.hparams['dropout_keep_prob'],
is_training=is_training,
scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
return concat_logits
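# Example usage (a sketch following dense_prediction_cell_test.py): build a
# two-branch cell on random features; names mirror the constants above.
#
#   cell = DensePredictionCell(
#       config=[
#           {_INPUT: -1, _OP: _CONV, _KERNEL: 1},
#           {_INPUT: 0, _OP: _CONV, _KERNEL: 3, _RATE: [1, 3]},
#       ],
#       hparams={'conv_rate_multiplier': 2})
#   features = tf.random_normal([2, 33, 33, 5])
#   outputs = cell.build_cell(features, output_stride=8, crop_size=[257, 257])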
[{"kernel": 3, "rate": [1, 6], "op": "conv", "input": -1}, {"kernel": 3, "rate": [18, 15], "op": "conv", "input": 0}, {"kernel": 3, "rate": [6, 3], "op": "conv", "input": 1}, {"kernel": 3, "rate": [1, 1], "op": "conv", "input": 0}, {"kernel": 3, "rate": [6, 21], "op": "conv", "input": 0}]
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for dense_prediction_cell."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from deeplab.core import dense_prediction_cell
class DensePredictionCellTest(tf.test.TestCase):
def setUp(self):
self.segmentation_layer = dense_prediction_cell.DensePredictionCell(
config=[
{
dense_prediction_cell._INPUT: -1,
dense_prediction_cell._OP: dense_prediction_cell._CONV,
dense_prediction_cell._KERNEL: 1,
},
{
dense_prediction_cell._INPUT: 0,
dense_prediction_cell._OP: dense_prediction_cell._CONV,
dense_prediction_cell._KERNEL: 3,
dense_prediction_cell._RATE: [1, 3],
},
{
dense_prediction_cell._INPUT: 1,
dense_prediction_cell._OP: (
dense_prediction_cell._PYRAMID_POOLING),
dense_prediction_cell._GRID_SIZE: [1, 2],
},
],
hparams={'conv_rate_multiplier': 2})
def testPyramidPoolingArguments(self):
features_size, pooled_kernel = (
self.segmentation_layer._get_pyramid_pooling_arguments(
crop_size=[513, 513],
output_stride=16,
image_grid=[4, 4]))
self.assertListEqual(features_size, [33, 33])
self.assertListEqual(pooled_kernel, [9, 9])
def testPyramidPoolingArgumentsWithImageGrid1x1(self):
features_size, pooled_kernel = (
self.segmentation_layer._get_pyramid_pooling_arguments(
crop_size=[257, 257],
output_stride=16,
image_grid=[1, 1]))
self.assertListEqual(features_size, [17, 17])
self.assertListEqual(pooled_kernel, [17, 17])
def testParseOperationStringWithConv1x1(self):
operation = self.segmentation_layer._parse_operation(
config={
dense_prediction_cell._OP: dense_prediction_cell._CONV,
dense_prediction_cell._KERNEL: [1, 1],
},
crop_size=[513, 513], output_stride=16)
self.assertEqual(operation[dense_prediction_cell._OP],
dense_prediction_cell._CONV)
self.assertListEqual(operation[dense_prediction_cell._KERNEL], [1, 1])
def testParseOperationStringWithConv3x3(self):
operation = self.segmentation_layer._parse_operation(
config={
dense_prediction_cell._OP: dense_prediction_cell._CONV,
dense_prediction_cell._KERNEL: [3, 3],
dense_prediction_cell._RATE: [9, 6],
},
crop_size=[513, 513], output_stride=16)
self.assertEqual(operation[dense_prediction_cell._OP],
dense_prediction_cell._CONV)
self.assertListEqual(operation[dense_prediction_cell._KERNEL], [3, 3])
self.assertEqual(operation[dense_prediction_cell._RATE], [9, 6])
def testParseOperationStringWithPyramidPooling2x2(self):
operation = self.segmentation_layer._parse_operation(
config={
dense_prediction_cell._OP: dense_prediction_cell._PYRAMID_POOLING,
dense_prediction_cell._GRID_SIZE: [2, 2],
},
crop_size=[513, 513],
output_stride=16)
self.assertEqual(operation[dense_prediction_cell._OP],
dense_prediction_cell._PYRAMID_POOLING)
# The feature maps of size [33, 33] should be covered by 2x2 kernels with
# size [17, 17].
self.assertListEqual(
operation[dense_prediction_cell._TARGET_SIZE], [33, 33])
self.assertListEqual(operation[dense_prediction_cell._KERNEL], [17, 17])
def testBuildCell(self):
with self.test_session(graph=tf.Graph()) as sess:
features = tf.random_normal([2, 33, 33, 5])
concat_logits = self.segmentation_layer.build_cell(
features,
output_stride=8,
crop_size=[257, 257])
sess.run(tf.global_variables_initializer())
concat_logits = sess.run(concat_logits)
self.assertTrue(concat_logits.any())
def testBuildCellWithImagePoolingCropSize(self):
with self.test_session(graph=tf.Graph()) as sess:
features = tf.random_normal([2, 33, 33, 5])
concat_logits = self.segmentation_layer.build_cell(
features,
output_stride=8,
crop_size=[257, 257],
image_pooling_crop_size=[129, 129])
sess.run(tf.global_variables_initializer())
concat_logits = sess.run(concat_logits)
self.assertTrue(concat_logits.any())
if __name__ == '__main__':
tf.test.main()
...@@ -126,7 +126,7 @@ networks_to_feature_maps = {
},
'xception_71': {
DECODER_END_POINTS: [
'entry_flow/block3/unit_1/xception_module/'
'separable_conv2_pointwise',
],
},
......
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This script contains utility functions."""
import tensorflow as tf
slim = tf.contrib.slim
def scale_dimension(dim, scale):
"""Scales the input dimension.
Args:
dim: Input dimension (a scalar or a scalar Tensor).
scale: The amount of scaling applied to the input.
Returns:
Scaled dimension.
"""
if isinstance(dim, tf.Tensor):
return tf.cast((tf.to_float(dim) - 1.0) * scale + 1.0, dtype=tf.int32)
else:
return int((float(dim) - 1.0) * scale + 1.0)
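# For example (values from utils_test.py): scale_dimension(321, 0.5) = 161 and
# scale_dimension(321, 0.75) = 241. The "(dim - 1) * scale + 1" form keeps
# dimensions of the form k * stride + 1 (e.g. 321, 513) aligned after scaling.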
def split_separable_conv2d(inputs,
filters,
kernel_size=3,
rate=1,
weight_decay=0.00004,
depthwise_weights_initializer_stddev=0.33,
pointwise_weights_initializer_stddev=0.06,
scope=None):
"""Splits a separable conv2d into depthwise and pointwise conv2d.
This operation differs from `tf.layers.separable_conv2d` as it applies an
activation function between the depthwise and pointwise conv2d.
Args:
inputs: Input tensor with shape [batch, height, width, channels].
filters: Number of filters in the 1x1 pointwise convolution.
kernel_size: A list of length 2: [kernel_height, kernel_width] of the
filters. Can be an int if both values are the same.
rate: Atrous convolution rate for the depthwise convolution.
weight_decay: The weight decay to use for regularizing the model.
depthwise_weights_initializer_stddev: The standard deviation of the
truncated normal weight initializer for depthwise convolution.
pointwise_weights_initializer_stddev: The standard deviation of the
truncated normal weight initializer for pointwise convolution.
scope: Optional scope for the operation.
Returns:
Computed features after split separable conv2d.
"""
outputs = slim.separable_conv2d(
inputs,
None,
kernel_size=kernel_size,
depth_multiplier=1,
rate=rate,
weights_initializer=tf.truncated_normal_initializer(
stddev=depthwise_weights_initializer_stddev),
weights_regularizer=None,
scope=scope + '_depthwise')
return slim.conv2d(
outputs,
filters,
1,
weights_initializer=tf.truncated_normal_initializer(
stddev=pointwise_weights_initializer_stddev),
weights_regularizer=slim.l2_regularizer(weight_decay),
scope=scope + '_pointwise')
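# Example usage (a sketch): an atrous 3x3 separable convolution producing 256
# channels, as employed by the ASPP branches in model.py:
#
#   aspp_features = split_separable_conv2d(
#       features, filters=256, rate=6, weight_decay=0.00004, scope='aspp1')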
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for utils.py."""
import tensorflow as tf
from deeplab.core import utils
class UtilsTest(tf.test.TestCase):
def testScaleDimensionOutput(self):
self.assertEqual(161, utils.scale_dimension(321, 0.5))
self.assertEqual(193, utils.scale_dimension(321, 0.6))
self.assertEqual(241, utils.scale_dimension(321, 0.75))
if __name__ == '__main__':
tf.test.main()
...@@ -79,6 +79,14 @@ ${PATH_TO_DATASET} is the directory in which the Cityscapes dataset resides.
3. Users may skip the flag `decoder_output_stride` if they do not want to
use the decoder structure.
4. Change and add the following flags to use the provided dense prediction
cell.
```bash
--model_variant="xception_71"
--dense_prediction_cell_json="deeplab/core/dense_prediction_cell_branch5_top1_cityscapes.json"
```
A local evaluation job using `xception_65` can be run with the following
command:
......
...@@ -30,8 +30,8 @@ Checkpoint name | Network backbone | Pretrained dataset | ASPP | D
--------------------------- | :--------------: | :-----------------: | :---: | :-----:
mobilenetv2_coco_voc_trainaug | MobileNet-v2 | MS-COCO <br> VOC 2012 train_aug set | N/A | N/A
mobilenetv2_coco_voc_trainval | MobileNet-v2 | MS-COCO <br> VOC 2012 train_aug + trainval sets | N/A | N/A
xception65_coco_voc_trainaug | Xception_65 | MS-COCO <br> VOC 2012 train_aug set | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
xception65_coco_voc_trainval | Xception_65 | MS-COCO <br> VOC 2012 train_aug + trainval sets | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4

In the table, **OS** denotes output stride.
...@@ -39,8 +39,8 @@ Checkpoint name
------------------------------------------------------------------------------------------------------------------------ | :-------: | :------------------------: | :-------------: | :------------------: | :------------: | :----------------------------: | :-------:
[mobilenetv2_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 2.75B <br> 152.59B | 0.1 <br> 26.9 | 75.32% (val) <br> 77.33% (val) | 23MB
[mobilenetv2_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_trainval_2018_01_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 152.59B | 26.9 | 80.25% (**test**) | 23MB
[xception65_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_pascal_train_aug_2018_01_04.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 54.17B <br> 3055.35B | 0.7 <br> 223.2 | 82.20% (val) <br> 83.58% (val) | 439MB
[xception65_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_pascal_trainval_2018_01_04.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 3055.35B | 223.2 | 87.80% (**test**) | 439MB

In the table, we report both computation complexity (in terms of Multiply-Adds
and CPU Runtime) and segmentation performance (in terms of mIOU) on the PASCAL
...@@ -61,14 +61,20 @@ dataset and does not employ ASPP and decoder modules for fast computation.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
mobilenetv2_coco_cityscapes_trainfine | MobileNet-v2 | MS-COCO <br> Cityscapes train_fine set | N/A | N/A
xception65_cityscapes_trainfine | Xception_65 | ImageNet <br> Cityscapes train_fine set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
xception71_dpc_cityscapes_trainfine | Xception_71 | ImageNet <br> MS-COCO <br> Cityscapes train_fine set | Dense Prediction Cell | OS = 4
xception71_dpc_cityscapes_trainval | Xception_71 | ImageNet <br> MS-COCO <br> Cityscapes trainval_fine and coarse set | Dense Prediction Cell | OS = 4
In the table, **OS** denotes output stride.

Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | Cityscapes mIOU | File Size
-------------------------------------------------------------------------------------------------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :------------: | :----------------------------: | :-------:
[mobilenetv2_coco_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_mnv2_cityscapes_train_2018_02_05.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 21.27B <br> 433.24B | 0.8 <br> 51.12 | 70.71% (val) <br> 73.57% (val) | 23MB
[xception65_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
[xception71_dpc_cityscapes_trainfine](http://download.tensorflow.org/models/deeplab_cityscapes_xception71_trainfine_2018_09_08.tar.gz) | 16 | [1.0] | No | 502.07B | - | 80.31% (val) | 445MB
[xception71_dpc_cityscapes_trainval](http://download.tensorflow.org/models/deeplab_cityscapes_xception71_trainvalfine_2018_09_08.tar.gz) | 8 | [0.75:0.25:2] | Yes | - | - | 82.66% (**test**) | 446MB
## DeepLab models trained on ADE20K
...@@ -80,11 +86,11 @@ dataset rule.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
xception65_ade20k_train | Xception_65 | ImageNet <br> ADE20K training set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4

Checkpoint name | Eval OS | Eval scales | Left-right Flip | mIOU | Pixel-wise Accuracy | File Size
------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :-------------------: | :-------:
[xception65_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 45.65% (val) | 82.52% (val) | 439MB

## Checkpoints pretrained on ImageNet
...@@ -170,4 +176,4 @@ Model name
12. **Scene Parsing through ADE20K Dataset**<br />
    Bolei Zhou, Hang Zhao, Xavier Puig, Sanja Fidler, Adela Barriuso, Antonio Torralba<br />
    [[link]](http://groups.csail.mit.edu/vision/datasets/ADE20K/). In CVPR,
    2017.
...@@ -52,7 +52,10 @@ Alan L. Yuille (* equal contribution)
(https://arxiv.org/abs/1412.7062)
"""
import tensorflow as tf
from deeplab.core import dense_prediction_cell
from deeplab.core import feature_extractor
from deeplab.core import utils
slim = tf.contrib.slim
...@@ -62,7 +65,10 @@ IMAGE_POOLING_SCOPE = 'image_pooling'
ASPP_SCOPE = 'aspp'
CONCAT_PROJECTION_SCOPE = 'concat_projection'
DECODER_SCOPE = 'decoder'
META_ARCHITECTURE_SCOPE = 'meta_architecture'
scale_dimension = utils.scale_dimension
split_separable_conv2d = utils.split_separable_conv2d
def get_extra_layer_scopes(last_layers_contain_logits_only=False):
"""Gets the scopes for extra layers.
...@@ -83,6 +89,7 @@ def get_extra_layer_scopes(last_layers_contain_logits_only=False):
ASPP_SCOPE,
CONCAT_PROJECTION_SCOPE,
DECODER_SCOPE,
META_ARCHITECTURE_SCOPE,
]
...@@ -186,20 +193,20 @@ def predict_labels(images, model_options, image_pyramid=None):
return predictions
def _resize_bilinear(images, size, output_dtype=tf.float32):
  """Returns resized images as output_dtype.

  Args:
    images: A tensor of size [batch, height_in, width_in, channels].
    size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new
      size for the images.
    output_dtype: The destination type.

  Returns:
    A tensor of size [batch, height_out, width_out, channels] as a dtype of
      output_dtype.
  """
  images = tf.image.resize_bilinear(images, size, align_corners=True)
  return tf.cast(images, dtype=output_dtype)
def multi_scale_logits(images,
...@@ -355,92 +362,120 @@ def extract_features(images,
  if not model_options.aspp_with_batch_norm:
    return features, end_points
  else:
    if model_options.dense_prediction_cell_config is not None:
      tf.logging.info('Using dense prediction cell config.')
      dense_prediction_layer = dense_prediction_cell.DensePredictionCell(
          config=model_options.dense_prediction_cell_config,
          hparams={
              'conv_rate_multiplier': 16 // model_options.output_stride,
          })
      concat_logits = dense_prediction_layer.build_cell(
          features,
          output_stride=model_options.output_stride,
          crop_size=model_options.crop_size,
          image_pooling_crop_size=model_options.image_pooling_crop_size,
          weight_decay=weight_decay,
          reuse=reuse,
          is_training=is_training,
          fine_tune_batch_norm=fine_tune_batch_norm)
      return concat_logits, end_points
    else:
      # The following code employs the DeepLabv3 ASPP module. Note that we
      # could express the ASPP module as one particular dense prediction
      # cell architecture. We do not do so, but leave the following code in
      # place for backward compatibility.
      batch_norm_params = {
          'is_training': is_training and fine_tune_batch_norm,
          'decay': 0.9997,
          'epsilon': 1e-5,
          'scale': True,
      }
      with slim.arg_scope(
          [slim.conv2d, slim.separable_conv2d],
          weights_regularizer=slim.l2_regularizer(weight_decay),
          activation_fn=tf.nn.relu,
          normalizer_fn=slim.batch_norm,
          padding='SAME',
          stride=1,
          reuse=reuse):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
          depth = 256
          branch_logits = []

          if model_options.add_image_level_feature:
            if model_options.crop_size is not None:
              image_pooling_crop_size = model_options.image_pooling_crop_size
              # If image_pooling_crop_size is not specified, use crop_size.
              if image_pooling_crop_size is None:
                image_pooling_crop_size = model_options.crop_size
              pool_height = scale_dimension(
                  image_pooling_crop_size[0],
                  1. / model_options.output_stride)
              pool_width = scale_dimension(
                  image_pooling_crop_size[1],
                  1. / model_options.output_stride)
              image_feature = slim.avg_pool2d(
                  features, [pool_height, pool_width], [1, 1],
                  padding='VALID')
              resize_height = scale_dimension(
                  model_options.crop_size[0],
                  1. / model_options.output_stride)
              resize_width = scale_dimension(
                  model_options.crop_size[1],
                  1. / model_options.output_stride)
            else:
              # If crop_size is None, we simply do global pooling.
              pool_height = tf.shape(features)[1]
              pool_width = tf.shape(features)[2]
              image_feature = tf.reduce_mean(
                  features, axis=[1, 2], keepdims=True)
              resize_height = pool_height
              resize_width = pool_width
            image_feature = slim.conv2d(
                image_feature, depth, 1, scope=IMAGE_POOLING_SCOPE)
            image_feature = _resize_bilinear(
                image_feature,
                [resize_height, resize_width],
                image_feature.dtype)
            # Set shape for resize_height/resize_width if they are not Tensor.
            if isinstance(resize_height, tf.Tensor):
              resize_height = None
            if isinstance(resize_width, tf.Tensor):
              resize_width = None
            image_feature.set_shape([None, resize_height, resize_width, depth])
            branch_logits.append(image_feature)

          # Employ a 1x1 convolution.
          branch_logits.append(slim.conv2d(features, depth, 1,
                                           scope=ASPP_SCOPE + str(0)))

          if model_options.atrous_rates:
            # Employ 3x3 convolutions with different atrous rates.
            for i, rate in enumerate(model_options.atrous_rates, 1):
              scope = ASPP_SCOPE + str(i)
              if model_options.aspp_with_separable_conv:
                aspp_features = split_separable_conv2d(
                    features,
                    filters=depth,
                    rate=rate,
                    weight_decay=weight_decay,
                    scope=scope)
              else:
                aspp_features = slim.conv2d(
                    features, depth, 3, rate=rate, scope=scope)
              branch_logits.append(aspp_features)

          # Merge branch logits.
          concat_logits = tf.concat(branch_logits, 3)
          concat_logits = slim.conv2d(
              concat_logits, depth, 1, scope=CONCAT_PROJECTION_SCOPE)
          concat_logits = slim.dropout(
              concat_logits,
              keep_prob=0.9,
              is_training=is_training,
              scope=CONCAT_PROJECTION_SCOPE + '_dropout')

          return concat_logits, end_points
def _get_logits(images,
...@@ -672,52 +707,3 @@ def get_branch_logits(features,
scope=scope))
return tf.add_n(branch_logits)
...@@ -23,11 +23,6 @@ from deeplab import model
class DeeplabModelTest(tf.test.TestCase):
def testWrongDeepLabVariant(self):
model_options = common.ModelOptions([])._replace(
model_variant='no_such_variant')
...@@ -115,6 +110,37 @@ class DeeplabModelTest(tf.test.TestCase):
for logits in scales_to_logits.values():
self.assertTrue(logits.any())
def testBuildDeepLabWithDensePredictionCell(self):
batch_size = 1
crop_size = [33, 33]
outputs_to_num_classes = {'semantic': 2}
expected_endpoints = ['merged_logits']
dense_prediction_cell_config = [
{'kernel': 3, 'rate': [1, 6], 'op': 'conv', 'input': -1},
{'kernel': 3, 'rate': [18, 15], 'op': 'conv', 'input': 0},
]
model_options = common.ModelOptions(
outputs_to_num_classes,
crop_size,
output_stride=16)._replace(
aspp_with_batch_norm=True,
model_variant='mobilenet_v2',
dense_prediction_cell_config=dense_prediction_cell_config)
g = tf.Graph()
with g.as_default():
with self.test_session(graph=g):
inputs = tf.random_uniform(
(batch_size, crop_size[0], crop_size[1], 3))
outputs_to_scales_to_model_results = model.multi_scale_logits(
inputs,
model_options,
image_pyramid=[1.0])
for output in outputs_to_num_classes:
scales_to_model_results = outputs_to_scales_to_model_results[output]
self.assertListEqual(list(scales_to_model_results.keys()),
                     expected_endpoints)
self.assertEqual(len(scales_to_model_results), 1)
if __name__ == '__main__':
tf.test.main()