Unverified Commit 6801ea36 authored by aquariusjay's avatar aquariusjay Committed by GitHub
Browse files

Merge pull request #5430 from huihui-personal/master

Open source `Searching for Efficient Multi-Scale Architectures for Dense Image Prediction`
parents eb370577 ba0f9acb
......@@ -52,6 +52,18 @@ works:
}
```
* Architecture search for dense prediction cell:
```
@inproceedings{dpc2018,
title={Searching for Efficient Multi-Scale Architectures for Dense Image Prediction},
author={Liang-Chieh Chen and Maxwell D. Collins and Yukun Zhu and George Papandreou and Barret Zoph and Florian Schroff and Hartwig Adam and Jonathon Shlens},
booktitle={NIPS},
year={2018}
}
```
In the current implementation, we support adopting the following network
backbones:
......@@ -114,6 +126,18 @@ with "deeplab".
## Change Logs
### October 1, 2018
Released MobileNet-v2 depth-multiplier = 0.5 COCO-pretrained checkpoints on
PASCAL VOC 2012, and Xception-65 COCO pretrained checkpoint (i.e., no PASCAL
pretrained).
### September 5, 2018
Released Cityscapes pretrained checkpoints with the best dense prediction cell found by architecture search.
### May 26, 2018
Updated ADE20K pretrained checkpoint.
......
......@@ -18,6 +18,7 @@ Common flags from train/eval/vis/export_model.py are collected in this script.
"""
import collections
import copy
import json
import tensorflow as tf
......@@ -85,6 +86,11 @@ flags.DEFINE_boolean('decoder_use_separable_conv', True,
flags.DEFINE_enum('merge_method', 'max', ['max', 'avg'],
'Scheme to merge multi scale features.')
flags.DEFINE_string(
'dense_prediction_cell_json',
'',
'A JSON file that specifies the dense prediction cell.')
FLAGS = flags.FLAGS
# Constants
......@@ -122,6 +128,7 @@ class ModelOptions(
'logits_kernel_size',
'model_variant',
'depth_multiplier',
'dense_prediction_cell_config',
])):
"""Immutable class to hold model options."""
......@@ -145,13 +152,19 @@ class ModelOptions(
Returns:
A new ModelOptions instance.
"""
dense_prediction_cell_config = None
if FLAGS.dense_prediction_cell_json:
with tf.gfile.Open(FLAGS.dense_prediction_cell_json, 'r') as f:
dense_prediction_cell_config = json.load(f)
return super(ModelOptions, cls).__new__(
cls, outputs_to_num_classes, crop_size, atrous_rates, output_stride,
FLAGS.merge_method, FLAGS.add_image_level_feature,
FLAGS.image_pooling_crop_size, FLAGS.aspp_with_batch_norm,
FLAGS.aspp_with_separable_conv, FLAGS.multi_grid,
FLAGS.decoder_output_stride, FLAGS.decoder_use_separable_conv,
FLAGS.logits_kernel_size, FLAGS.model_variant, FLAGS.depth_multiplier)
FLAGS.logits_kernel_size, FLAGS.model_variant, FLAGS.depth_multiplier,
dense_prediction_cell_config)
def __deepcopy__(self, memo):
return ModelOptions(copy.deepcopy(self.outputs_to_num_classes),
......
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dense Prediction Cell class that can be evolved in semantic segmentation.
DensePredictionCell is used as a `layer` in semantic segmentation whose
architecture is determined by the `config`, a dictionary specifying
the architecture.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from deeplab.core import utils
slim = tf.contrib.slim
# Local constants.
_META_ARCHITECTURE_SCOPE = 'meta_architecture'
_CONCAT_PROJECTION_SCOPE = 'concat_projection'
_OP = 'op'
_CONV = 'conv'
_PYRAMID_POOLING = 'pyramid_pooling'
_KERNEL = 'kernel'
_RATE = 'rate'
_GRID_SIZE = 'grid_size'
_TARGET_SIZE = 'target_size'
_INPUT = 'input'
def dense_prediction_cell_hparams():
  """DensePredictionCell HParams.

  Returns:
    A dictionary of hyper-parameters used for dense prediction cell with keys:
      - reduction_size: Integer, the number of output filters for each operation
          inside the cell.
      - dropout_on_concat_features: Boolean, apply dropout on the concatenated
          features or not.
      - dropout_on_projection_features: Boolean, apply dropout on the projection
          features or not.
      - dropout_keep_prob: Float, when `dropout_on_concat_features' or
          `dropout_on_projection_features' is True, the `keep_prob` value used
          in the dropout operation.
      - concat_channels: Integer, the concatenated features will be
          channel-reduced to `concat_channels` channels.
      - conv_rate_multiplier: Integer, used to multiply the convolution rates.
          This is useful in the case when the output_stride is changed from 16
          to 8, we need to double the convolution rates correspondingly.
  """
  # Default hyper-parameters; callers may override any subset via the
  # `hparams` argument of DensePredictionCell.__init__.
  return dict(
      reduction_size=256,
      dropout_on_concat_features=True,
      dropout_on_projection_features=False,
      dropout_keep_prob=0.9,
      concat_channels=256,
      conv_rate_multiplier=1,
  )
class DensePredictionCell(object):
  """DensePredictionCell class used as a 'layer' in semantic segmentation."""

  def __init__(self, config, hparams=None):
    """Initializes the dense prediction cell.

    Args:
      config: A dictionary storing the architecture of a dense prediction cell.
      hparams: A dictionary of hyper-parameters, provided by users. This
        dictionary will be used to update the default dictionary returned by
        dense_prediction_cell_hparams().

    Raises:
      ValueError: If `conv_rate_multiplier` has value < 1.
    """
    self.hparams = dense_prediction_cell_hparams()
    if hparams is not None:
      # User-provided hyper-parameters override the defaults.
      self.hparams.update(hparams)
    self.config = config

    # Check values in hparams are valid or not.
    if self.hparams['conv_rate_multiplier'] < 1:
      raise ValueError('conv_rate_multiplier cannot have value < 1.')

  def _get_pyramid_pooling_arguments(
      self, crop_size, output_stride, image_grid, image_pooling_crop_size=None):
    """Gets arguments for pyramid pooling.

    Args:
      crop_size: A list of two integers, [crop_height, crop_width] specifying
        whole patch crop size.
      output_stride: Integer, output stride value for extracted features.
      image_grid: A list of two integers, [image_grid_height, image_grid_width],
        specifying the grid size of how the pyramid pooling will be performed.
      image_pooling_crop_size: A list of two integers, [crop_height, crop_width]
        specifying the crop size for image pooling operations. Note that we
        decouple whole patch crop_size and image_pooling_crop_size as one could
        perform the image_pooling with different crop sizes.

    Returns:
      A list of (resize_value, pooled_kernel)
    """
    # Spatial size the pooled branch must be resized back to, derived from
    # the whole-patch crop size at the given output stride.
    resize_height = utils.scale_dimension(crop_size[0], 1. / output_stride)
    resize_width = utils.scale_dimension(crop_size[1], 1. / output_stride)
    # If image_pooling_crop_size is not specified, use crop_size.
    if image_pooling_crop_size is None:
      image_pooling_crop_size = crop_size
    # Kernel size that tiles the feature map into image_grid cells.
    pooled_height = utils.scale_dimension(
        image_pooling_crop_size[0], 1. / (output_stride * image_grid[0]))
    pooled_width = utils.scale_dimension(
        image_pooling_crop_size[1], 1. / (output_stride * image_grid[1]))
    return ([resize_height, resize_width], [pooled_height, pooled_width])

  def _parse_operation(self, config, crop_size, output_stride,
                       image_pooling_crop_size=None):
    """Parses one operation.

    When 'operation' is 'pyramid_pooling', we compute the required
    hyper-parameters and save in config.

    Args:
      config: A dictionary storing required hyper-parameters for one
        operation.
      crop_size: A list of two integers, [crop_height, crop_width] specifying
        whole patch crop size.
      output_stride: Integer, output stride value for extracted features.
      image_pooling_crop_size: A list of two integers, [crop_height, crop_width]
        specifying the crop size for image pooling operations. Note that we
        decouple whole patch crop_size and image_pooling_crop_size as one could
        perform the image_pooling with different crop sizes.

    Returns:
      A dictionary stores the related information for the operation.
    """
    # NOTE: for pyramid pooling ops this mutates `config` in place (adds
    # _TARGET_SIZE and _KERNEL entries) and returns the same dictionary.
    if config[_OP] == _PYRAMID_POOLING:
      (config[_TARGET_SIZE],
       config[_KERNEL]) = self._get_pyramid_pooling_arguments(
           crop_size=crop_size,
           output_stride=output_stride,
           image_grid=config[_GRID_SIZE],
           image_pooling_crop_size=image_pooling_crop_size)
    return config

  def build_cell(self,
                 features,
                 output_stride=16,
                 crop_size=None,
                 image_pooling_crop_size=None,
                 weight_decay=0.00004,
                 reuse=None,
                 is_training=False,
                 fine_tune_batch_norm=False,
                 scope=None):
    """Builds the dense prediction cell based on the config.

    Args:
      features: Input feature map of size [batch, height, width, channels].
      output_stride: Int, output stride at which the features were extracted.
      crop_size: A list [crop_height, crop_width], determining the input
        features resolution.
      image_pooling_crop_size: A list of two integers, [crop_height, crop_width]
        specifying the crop size for image pooling operations. Note that we
        decouple whole patch crop_size and image_pooling_crop_size as one could
        perform the image_pooling with different crop sizes.
      weight_decay: Float, the weight decay for model variables.
      reuse: Reuse the model variables or not.
      is_training: Boolean, is training or not.
      fine_tune_batch_norm: Boolean, fine-tuning batch norm parameters or not.
      scope: Optional string, specifying the variable scope.

    Returns:
      Features after passing through the constructed dense prediction cell with
        shape = [batch, height, width, channels] where channels are determined
        by `reduction_size` returned by dense_prediction_cell_hparams().

    Raises:
      ValueError: Use Convolution with kernel size not equal to 1x1 or 3x3 or
        the operation is not recognized.
    """
    batch_norm_params = {
        # Batch norm statistics are only updated when both training and
        # batch-norm fine-tuning are requested.
        'is_training': is_training and fine_tune_batch_norm,
        'decay': 0.9997,
        'epsilon': 1e-5,
        'scale': True,
    }
    hparams = self.hparams
    with slim.arg_scope(
        [slim.conv2d, slim.separable_conv2d],
        weights_regularizer=slim.l2_regularizer(weight_decay),
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        padding='SAME',
        stride=1,
        reuse=reuse):
      with slim.arg_scope([slim.batch_norm], **batch_norm_params):
        with tf.variable_scope(scope, _META_ARCHITECTURE_SCOPE, [features]):
          depth = hparams['reduction_size']
          branch_logits = []
          for i, current_config in enumerate(self.config):
            # NOTE(review): rebinds the `scope` argument; harmless here since
            # the enclosing variable_scope has already been entered.
            scope = 'branch%d' % i
            current_config = self._parse_operation(
                config=current_config,
                crop_size=crop_size,
                output_stride=output_stride,
                image_pooling_crop_size=image_pooling_crop_size)
            tf.logging.info(current_config)
            # A negative input index selects the cell input `features`;
            # otherwise the output of a previously built branch is used.
            if current_config[_INPUT] < 0:
              operation_input = features
            else:
              operation_input = branch_logits[current_config[_INPUT]]
            if current_config[_OP] == _CONV:
              if current_config[_KERNEL] == [1, 1] or current_config[
                  _KERNEL] == 1:
                branch_logits.append(
                    slim.conv2d(operation_input, depth, 1, scope=scope))
              else:
                # Scale the atrous rates, e.g. doubled when output_stride is
                # changed from 16 to 8.
                conv_rate = [r * hparams['conv_rate_multiplier']
                             for r in current_config[_RATE]]
                branch_logits.append(
                    utils.split_separable_conv2d(
                        operation_input,
                        filters=depth,
                        kernel_size=current_config[_KERNEL],
                        rate=conv_rate,
                        weight_decay=weight_decay,
                        scope=scope))
            elif current_config[_OP] == _PYRAMID_POOLING:
              pooled_features = slim.avg_pool2d(
                  operation_input,
                  kernel_size=current_config[_KERNEL],
                  stride=[1, 1],
                  padding='VALID')
              pooled_features = slim.conv2d(
                  pooled_features,
                  depth,
                  1,
                  scope=scope)
              # Resize back so every branch has the same spatial size before
              # concatenation.
              pooled_features = tf.image.resize_bilinear(
                  pooled_features,
                  current_config[_TARGET_SIZE],
                  align_corners=True)
              # Set shape for resize_height/resize_width if they are not Tensor.
              resize_height = current_config[_TARGET_SIZE][0]
              resize_width = current_config[_TARGET_SIZE][1]
              if isinstance(resize_height, tf.Tensor):
                resize_height = None
              if isinstance(resize_width, tf.Tensor):
                resize_width = None
              pooled_features.set_shape(
                  [None, resize_height, resize_width, depth])
              branch_logits.append(pooled_features)
            else:
              raise ValueError('Unrecognized operation.')
          # Merge branch logits.
          concat_logits = tf.concat(branch_logits, 3)
          if self.hparams['dropout_on_concat_features']:
            concat_logits = slim.dropout(
                concat_logits,
                keep_prob=self.hparams['dropout_keep_prob'],
                is_training=is_training,
                scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
          # Channel-reduce the concatenated branches to `concat_channels`.
          concat_logits = slim.conv2d(concat_logits,
                                      self.hparams['concat_channels'],
                                      1,
                                      scope=_CONCAT_PROJECTION_SCOPE)
          if self.hparams['dropout_on_projection_features']:
            concat_logits = slim.dropout(
                concat_logits,
                keep_prob=self.hparams['dropout_keep_prob'],
                is_training=is_training,
                scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
          return concat_logits
\ No newline at end of file
[{"kernel": 3, "rate": [1, 6], "op": "conv", "input": -1}, {"kernel": 3, "rate": [18, 15], "op": "conv", "input": 0}, {"kernel": 3, "rate": [6, 3], "op": "conv", "input": 1}, {"kernel": 3, "rate": [1, 1], "op": "conv", "input": 0}, {"kernel": 3, "rate": [6, 21], "op": "conv", "input": 0}]
\ No newline at end of file
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for dense_prediction_cell."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from deeplab.core import dense_prediction_cell
class DensePredictionCellTest(tf.test.TestCase):
  """Tests for DensePredictionCell argument parsing and graph construction."""

  def setUp(self):
    # A small 3-branch cell: a 1x1 conv on the cell input (-1), a 3x3 conv
    # with rates [1, 3] on branch 0, and a 1x2-grid pyramid pooling on
    # branch 1. conv_rate_multiplier=2 mimics an output_stride change
    # from 16 to 8.
    self.segmentation_layer = dense_prediction_cell.DensePredictionCell(
        config=[
            {
                dense_prediction_cell._INPUT: -1,
                dense_prediction_cell._OP: dense_prediction_cell._CONV,
                dense_prediction_cell._KERNEL: 1,
            },
            {
                dense_prediction_cell._INPUT: 0,
                dense_prediction_cell._OP: dense_prediction_cell._CONV,
                dense_prediction_cell._KERNEL: 3,
                dense_prediction_cell._RATE: [1, 3],
            },
            {
                dense_prediction_cell._INPUT: 1,
                dense_prediction_cell._OP: (
                    dense_prediction_cell._PYRAMID_POOLING),
                dense_prediction_cell._GRID_SIZE: [1, 2],
            },
        ],
        hparams={'conv_rate_multiplier': 2})

  def testPyramidPoolingArguments(self):
    # crop 513 at output_stride 16 gives (513 - 1) / 16 + 1 = 33 features;
    # a 4x4 grid over 33x33 yields 9x9 pooling kernels.
    features_size, pooled_kernel = (
        self.segmentation_layer._get_pyramid_pooling_arguments(
            crop_size=[513, 513],
            output_stride=16,
            image_grid=[4, 4]))
    self.assertListEqual(features_size, [33, 33])
    self.assertListEqual(pooled_kernel, [9, 9])

  def testPyramidPoolingArgumentsWithImageGrid1x1(self):
    # With a 1x1 grid the pooling kernel covers the whole feature map.
    features_size, pooled_kernel = (
        self.segmentation_layer._get_pyramid_pooling_arguments(
            crop_size=[257, 257],
            output_stride=16,
            image_grid=[1, 1]))
    self.assertListEqual(features_size, [17, 17])
    self.assertListEqual(pooled_kernel, [17, 17])

  def testParseOperationStringWithConv1x1(self):
    # Conv configs pass through _parse_operation unchanged.
    operation = self.segmentation_layer._parse_operation(
        config={
            dense_prediction_cell._OP: dense_prediction_cell._CONV,
            dense_prediction_cell._KERNEL: [1, 1],
        },
        crop_size=[513, 513], output_stride=16)
    self.assertEqual(operation[dense_prediction_cell._OP],
                     dense_prediction_cell._CONV)
    self.assertListEqual(operation[dense_prediction_cell._KERNEL], [1, 1])

  def testParseOperationStringWithConv3x3(self):
    operation = self.segmentation_layer._parse_operation(
        config={
            dense_prediction_cell._OP: dense_prediction_cell._CONV,
            dense_prediction_cell._KERNEL: [3, 3],
            dense_prediction_cell._RATE: [9, 6],
        },
        crop_size=[513, 513], output_stride=16)
    self.assertEqual(operation[dense_prediction_cell._OP],
                     dense_prediction_cell._CONV)
    self.assertListEqual(operation[dense_prediction_cell._KERNEL], [3, 3])
    self.assertEqual(operation[dense_prediction_cell._RATE], [9, 6])

  def testParseOperationStringWithPyramidPooling2x2(self):
    # Pyramid pooling configs gain _TARGET_SIZE and _KERNEL entries.
    operation = self.segmentation_layer._parse_operation(
        config={
            dense_prediction_cell._OP: dense_prediction_cell._PYRAMID_POOLING,
            dense_prediction_cell._GRID_SIZE: [2, 2],
        },
        crop_size=[513, 513],
        output_stride=16)
    self.assertEqual(operation[dense_prediction_cell._OP],
                     dense_prediction_cell._PYRAMID_POOLING)
    # The feature maps of size [33, 33] should be covered by 2x2 kernels with
    # size [17, 17].
    self.assertListEqual(
        operation[dense_prediction_cell._TARGET_SIZE], [33, 33])
    self.assertListEqual(operation[dense_prediction_cell._KERNEL], [17, 17])

  def testBuildCell(self):
    # Smoke test: the cell builds and produces non-zero activations.
    with self.test_session(graph=tf.Graph()) as sess:
      features = tf.random_normal([2, 33, 33, 5])
      concat_logits = self.segmentation_layer.build_cell(
          features,
          output_stride=8,
          crop_size=[257, 257])
      sess.run(tf.global_variables_initializer())
      concat_logits = sess.run(concat_logits)
      self.assertTrue(concat_logits.any())

  def testBuildCellWithImagePoolingCropSize(self):
    # Same as above but with a decoupled image-pooling crop size.
    with self.test_session(graph=tf.Graph()) as sess:
      features = tf.random_normal([2, 33, 33, 5])
      concat_logits = self.segmentation_layer.build_cell(
          features,
          output_stride=8,
          crop_size=[257, 257],
          image_pooling_crop_size=[129, 129])
      sess.run(tf.global_variables_initializer())
      concat_logits = sess.run(concat_logits)
      self.assertTrue(concat_logits.any())


if __name__ == '__main__':
  tf.test.main()
\ No newline at end of file
......@@ -126,7 +126,7 @@ networks_to_feature_maps = {
},
'xception_71': {
DECODER_END_POINTS: [
'entry_flow/block2/unit_1/xception_module/'
'entry_flow/block3/unit_1/xception_module/'
'separable_conv2_pointwise',
],
},
......
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This script contains utility functions."""
import tensorflow as tf
slim = tf.contrib.slim
def scale_dimension(dim, scale):
  """Scales the input dimension.

  Applies new_dim = (dim - 1) * scale + 1, the corner-aligned scaling
  convention used throughout DeepLab.

  Args:
    dim: Input dimension (a scalar or a scalar Tensor).
    scale: The amount of scaling applied to the input.

  Returns:
    Scaled dimension: a Python int for scalar input, or an int32 Tensor for
    Tensor input.
  """
  # Plain Python scalars are handled eagerly; Tensors stay in the graph.
  if not isinstance(dim, tf.Tensor):
    return int((float(dim) - 1.0) * scale + 1.0)
  return tf.cast((tf.to_float(dim) - 1.0) * scale + 1.0, dtype=tf.int32)
def split_separable_conv2d(inputs,
                           filters,
                           kernel_size=3,
                           rate=1,
                           weight_decay=0.00004,
                           depthwise_weights_initializer_stddev=0.33,
                           pointwise_weights_initializer_stddev=0.06,
                           scope=None):
  """Splits a separable conv2d into depthwise and pointwise conv2d.

  This operation differs from `tf.layers.separable_conv2d` as this operation
  applies activation function between depthwise and pointwise conv2d.

  Args:
    inputs: Input tensor with shape [batch, height, width, channels].
    filters: Number of filters in the 1x1 pointwise convolution.
    kernel_size: A list of length 2: [kernel_height, kernel_width] of
      of the filters. Can be an int if both values are the same.
    rate: Atrous convolution rate for the depthwise convolution.
    weight_decay: The weight decay to use for regularizing the model.
    depthwise_weights_initializer_stddev: The standard deviation of the
      truncated normal weight initializer for depthwise convolution.
    pointwise_weights_initializer_stddev: The standard deviation of the
      truncated normal weight initializer for pointwise convolution.
    scope: Optional scope for the operation.

  Returns:
    Computed features after split separable conv2d.
  """
  # NOTE(review): `scope + '_depthwise'` assumes `scope` is a string; the
  # documented default of None would raise a TypeError — confirm all callers
  # pass an explicit scope.
  # Depthwise pass: num_outputs=None makes slim skip the pointwise projection,
  # so only the per-channel spatial filtering is applied here. The depthwise
  # weights are deliberately not L2-regularized (weights_regularizer=None).
  outputs = slim.separable_conv2d(
      inputs,
      None,
      kernel_size=kernel_size,
      depth_multiplier=1,
      rate=rate,
      weights_initializer=tf.truncated_normal_initializer(
          stddev=depthwise_weights_initializer_stddev),
      weights_regularizer=None,
      scope=scope + '_depthwise')
  # Pointwise pass: 1x1 convolution mixing channels, with L2 regularization.
  return slim.conv2d(
      outputs,
      filters,
      1,
      weights_initializer=tf.truncated_normal_initializer(
          stddev=pointwise_weights_initializer_stddev),
      weights_regularizer=slim.l2_regularizer(weight_decay),
      scope=scope + '_pointwise')
\ No newline at end of file
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for utils.py."""
import tensorflow as tf
from deeplab.core import utils
class UtilsTest(tf.test.TestCase):
  """Unit tests for the helpers in utils.py."""

  def testScaleDimensionOutput(self):
    # Each (scale, expected) pair checks new_dim = (dim - 1) * scale + 1
    # for dim = 321.
    for scale, expected in [(0.5, 161), (0.6, 193), (0.75, 241)]:
      self.assertEqual(expected, utils.scale_dimension(321, scale))


if __name__ == '__main__':
  tf.test.main()
\ No newline at end of file
......@@ -79,6 +79,14 @@ ${PATH_TO_DATASET} is the directory in which the Cityscapes dataset resides.
3. You can skip the flag, `decoder_output_stride`, if you do not want
to use the decoder structure.
4. Change and add the following flags in order to use the provided dense prediction cell.
```bash
--model_variant="xception_71"
--dense_prediction_cell_json="deeplab/core/dense_prediction_cell_branch5_top1_cityscapes.json"
```
A local evaluation job using `xception_65` can be run with the following
command:
......
......@@ -28,19 +28,23 @@ employ ASPP and decoder modules for fast computation.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
--------------------------- | :--------------: | :-----------------: | :---: | :-----:
mobilenetv2_dm05_coco_voc_trainaug | MobileNet-v2 <br> Depth-Multiplier = 0.5 | MS-COCO <br> VOC 2012 train_aug set| N/A | N/A
mobilenetv2_dm05_coco_voc_trainval | MobileNet-v2 <br> Depth-Multiplier = 0.5 | MS-COCO <br> VOC 2012 train_aug + trainval sets | N/A | N/A
mobilenetv2_coco_voc_trainaug | MobileNet-v2 | MS-COCO <br> VOC 2012 train_aug set| N/A | N/A
mobilenetv2_coco_voc_trainval | MobileNet-v2 | MS-COCO <br> VOC 2012 train_aug + trainval sets | N/A | N/A
xception_coco_voc_trainaug | Xception_65 | MS-COCO <br> VOC 2012 train_aug set| [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
xception_coco_voc_trainval | Xception_65 | MS-COCO <br> VOC 2012 train_aug + trainval sets | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
xception65_coco_voc_trainaug | Xception_65 | MS-COCO <br> VOC 2012 train_aug set| [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
xception65_coco_voc_trainval | Xception_65 | MS-COCO <br> VOC 2012 train_aug + trainval sets | [6,12,18] for OS=16 <br> [12,24,36] for OS=8 | OS = 4
In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | PASCAL mIOU | File Size
------------------------------------------------------------------------------------------------------------------------ | :-------: | :------------------------: | :-------------: | :------------------: | :------------: | :----------------------------: | :-------:
[mobilenetv2_dm05_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_mnv2_dm05_pascal_trainaug_2018_10_01.tar.gz) | 16 | [1.0] | No | 0.88B | - | 70.19% (val) | 7.6MB
[mobilenetv2_dm05_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_mnv2_dm05_pascal_trainval_2018_10_01.tar.gz) | 8 | [1.0] | No | 2.84B | - | 71.83% (test) | 7.6MB
[mobilenetv2_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 2.75B <br> 152.59B | 0.1 <br> 26.9 | 75.32% (val) <br> 77.33 (val) | 23MB
[mobilenetv2_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_mnv2_pascal_trainval_2018_01_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 152.59B | 26.9 | 80.25% (**test**) | 23MB
[xception_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_pascal_train_aug_2018_01_04.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 54.17B <br> 3055.35B | 0.7 <br> 223.2 | 82.20% (val) <br> 83.58% (val) | 439MB
[xception_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_pascal_trainval_2018_01_04.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 3055.35B | 223.2 | 87.80% (**test**) | 439MB
[xception65_coco_voc_trainaug](http://download.tensorflow.org/models/deeplabv3_pascal_train_aug_2018_01_04.tar.gz) | 16 <br> 8 | [1.0] <br> [0.5:0.25:1.75] | No <br> Yes | 54.17B <br> 3055.35B | 0.7 <br> 223.2 | 82.20% (val) <br> 83.58% (val) | 439MB
[xception65_coco_voc_trainval](http://download.tensorflow.org/models/deeplabv3_pascal_trainval_2018_01_04.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 3055.35B | 223.2 | 87.80% (**test**) | 439MB
In the table, we report both computation complexity (in terms of Multiply-Adds
and CPU Runtime) and segmentation performance (in terms of mIOU) on the PASCAL
......@@ -61,14 +65,20 @@ dataset and does not employ ASPP and decoder modules for fast computation.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
mobilenetv2_coco_cityscapes_trainfine | MobileNet-v2 | MS-COCO <br> Cityscapes train_fine set | N/A | N/A
xception_cityscapes_trainfine | Xception_65 | ImageNet <br> Cityscapes train_fine set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
xception65_cityscapes_trainfine | Xception_65 | ImageNet <br> Cityscapes train_fine set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
xception71_dpc_cityscapes_trainfine | Xception_71 | ImageNet <br> MS-COCO <br> Cityscapes train_fine set | Dense Prediction Cell | OS = 4
xception71_dpc_cityscapes_trainval | Xception_71 | ImageNet <br> MS-COCO <br> Cityscapes trainval_fine and coarse set | Dense Prediction Cell | OS = 4
In the table, **OS** denotes output stride.
Checkpoint name | Eval OS | Eval scales | Left-right Flip | Multiply-Adds | Runtime (sec) | Cityscapes mIOU | File Size
-------------------------------------------------------------------------------------------------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :------------: | :----------------------------: | :-------:
[mobilenetv2_coco_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_mnv2_cityscapes_train_2018_02_05.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 21.27B <br> 433.24B | 0.8 <br> 51.12 | 70.71% (val) <br> 73.57% (val) | 23MB
[xception_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
[xception65_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
[xception71_dpc_cityscapes_trainfine](http://download.tensorflow.org/models/deeplab_cityscapes_xception71_trainfine_2018_09_08.tar.gz) | 16 | [1.0] | No | 502.07B | - | 80.31% (val) | 445MB
[xception71_dpc_cityscapes_trainval](http://download.tensorflow.org/models/deeplab_cityscapes_xception71_trainvalfine_2018_09_08.tar.gz) | 8 | [0.75:0.25:2] | Yes | - | - | 82.66% (**test**) | 446MB
## DeepLab models trained on ADE20K
......@@ -80,11 +90,11 @@ dataset rule.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
xception_ade20k_train | Xception_65 | ImageNet <br> ADE20K training set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
xception65_ade20k_train | Xception_65 | ImageNet <br> ADE20K training set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
Checkpoint name | Eval OS | Eval scales | Left-right Flip | mIOU | Pixel-wise Accuracy | File Size
------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :-------------------: | :-------:
[xception_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 45.65% (val) | 82.52% (val) | 439MB
[xception65_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 45.65% (val) | 82.52% (val) | 439MB
## Checkpoints pretrained on ImageNet
......@@ -94,8 +104,9 @@ Un-tar'ed directory includes:
### Model details
We also provide some checkpoints that are only pretrained on ImageNet so that
one could use this for training your own models.
We also provide some checkpoints that are pretrained on ImageNet and/or COCO (as
post-fixed in the model name) so that one could use this for training your own
models.
* mobilenet_v2: We refer the interested users to the TensorFlow open source
[MobileNet-V2](https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet)
......@@ -114,11 +125,12 @@ one could use this for training your own models.
Model name | File Size
-------------------------------------------------------------------------------------- | :-------:
[xception_41](http://download.tensorflow.org/models/xception_41_2018_05_09.tar.gz ) | 288MB
[xception_65](http://download.tensorflow.org/models/deeplabv3_xception_2018_01_04.tar.gz) | 447MB
[xception_71](http://download.tensorflow.org/models/xception_71_2018_05_09.tar.gz ) | 474MB
[resnet_v1_50_beta](http://download.tensorflow.org/models/resnet_v1_50_2018_05_04.tar.gz) | 274MB
[resnet_v1_101_beta](http://download.tensorflow.org/models/resnet_v1_101_2018_05_04.tar.gz) | 477MB
[xception_41_imagenet](http://download.tensorflow.org/models/xception_41_2018_05_09.tar.gz ) | 288MB
[xception_65_imagenet](http://download.tensorflow.org/models/deeplabv3_xception_2018_01_04.tar.gz) | 447MB
[xception_65_imagenet_coco](http://download.tensorflow.org/models/xception_65_coco_pretrained_2018_10_02.tar.gz) | 292MB
[xception_71_imagenet](http://download.tensorflow.org/models/xception_71_2018_05_09.tar.gz ) | 474MB
[resnet_v1_50_beta_imagenet](http://download.tensorflow.org/models/resnet_v1_50_2018_05_04.tar.gz) | 274MB
[resnet_v1_101_beta_imagenet](http://download.tensorflow.org/models/resnet_v1_101_2018_05_04.tar.gz) | 477MB
## References
......
......@@ -52,7 +52,10 @@ Alan L. Yuille (* equal contribution)
(https://arxiv.org/abs/1412.7062)
"""
import tensorflow as tf
from deeplab.core import dense_prediction_cell
from deeplab.core import feature_extractor
from deeplab.core import utils
slim = tf.contrib.slim
......@@ -62,7 +65,10 @@ IMAGE_POOLING_SCOPE = 'image_pooling'
ASPP_SCOPE = 'aspp'
CONCAT_PROJECTION_SCOPE = 'concat_projection'
DECODER_SCOPE = 'decoder'
META_ARCHITECTURE_SCOPE = 'meta_architecture'
scale_dimension = utils.scale_dimension
split_separable_conv2d = utils.split_separable_conv2d
def get_extra_layer_scopes(last_layers_contain_logits_only=False):
"""Gets the scopes for extra layers.
......@@ -83,6 +89,7 @@ def get_extra_layer_scopes(last_layers_contain_logits_only=False):
ASPP_SCOPE,
CONCAT_PROJECTION_SCOPE,
DECODER_SCOPE,
META_ARCHITECTURE_SCOPE,
]
......@@ -186,20 +193,20 @@ def predict_labels(images, model_options, image_pyramid=None):
return predictions
def scale_dimension(dim, scale):
"""Scales the input dimension.
def _resize_bilinear(images, size, output_dtype=tf.float32):
"""Returns resized images as output_type.
Args:
dim: Input dimension (a scalar or a scalar Tensor).
scale: The amount of scaling applied to the input.
images: A tensor of size [batch, height_in, width_in, channels].
size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new size
for the images.
output_dtype: The destination type.
Returns:
Scaled dimension.
A tensor of size [batch, height_out, width_out, channels] as a dtype of
output_dtype.
"""
if isinstance(dim, tf.Tensor):
return tf.cast((tf.to_float(dim) - 1.0) * scale + 1.0, dtype=tf.int32)
else:
return int((float(dim) - 1.0) * scale + 1.0)
images = tf.image.resize_bilinear(images, size, align_corners=True)
return tf.cast(images, dtype=output_dtype)
def multi_scale_logits(images,
......@@ -355,6 +362,28 @@ def extract_features(images,
if not model_options.aspp_with_batch_norm:
return features, end_points
else:
if model_options.dense_prediction_cell_config is not None:
tf.logging.info('Using dense prediction cell config.')
dense_prediction_layer = dense_prediction_cell.DensePredictionCell(
config=model_options.dense_prediction_cell_config,
hparams={
'conv_rate_multiplier': 16 // model_options.output_stride,
})
concat_logits = dense_prediction_layer.build_cell(
features,
output_stride=model_options.output_stride,
crop_size=model_options.crop_size,
image_pooling_crop_size=model_options.image_pooling_crop_size,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
return concat_logits, end_points
else:
# The following code employs the DeepLabv3 ASPP module. Note that we
# could express the ASPP module as one particular dense prediction
# cell architecture. We do not do so, but leave the following code in
# place for backward compatibility.
batch_norm_params = {
'is_training': is_training and fine_tune_batch_norm,
'decay': 0.9997,
......@@ -380,28 +409,34 @@ def extract_features(images,
# If image_pooling_crop_size is not specified, use crop_size.
if image_pooling_crop_size is None:
image_pooling_crop_size = model_options.crop_size
pool_height = scale_dimension(image_pooling_crop_size[0],
pool_height = scale_dimension(
image_pooling_crop_size[0],
1. / model_options.output_stride)
pool_width = scale_dimension(image_pooling_crop_size[1],
pool_width = scale_dimension(
image_pooling_crop_size[1],
1. / model_options.output_stride)
image_feature = slim.avg_pool2d(
features, [pool_height, pool_width], [1, 1], padding='VALID')
resize_height = scale_dimension(model_options.crop_size[0],
resize_height = scale_dimension(
model_options.crop_size[0],
1. / model_options.output_stride)
resize_width = scale_dimension(model_options.crop_size[1],
resize_width = scale_dimension(
model_options.crop_size[1],
1. / model_options.output_stride)
else:
# If crop_size is None, we simply do global pooling.
pool_height = tf.shape(features)[1]
pool_width = tf.shape(features)[2]
image_feature = tf.reduce_mean(features, axis=[1, 2])[:, tf.newaxis,
tf.newaxis]
image_feature = tf.reduce_mean(
features, axis=[1, 2], keepdims=True)
resize_height = pool_height
resize_width = pool_width
image_feature = slim.conv2d(
image_feature, depth, 1, scope=IMAGE_POOLING_SCOPE)
image_feature = tf.image.resize_bilinear(
image_feature, [resize_height, resize_width], align_corners=True)
image_feature = _resize_bilinear(
image_feature,
[resize_height, resize_width],
image_feature.dtype)
# Set shape for resize_height/resize_width if they are not Tensor.
if isinstance(resize_height, tf.Tensor):
resize_height = None
......@@ -672,52 +707,3 @@ def get_branch_logits(features,
scope=scope))
return tf.add_n(branch_logits)
def split_separable_conv2d(inputs,
                           filters,
                           kernel_size=3,
                           rate=1,
                           weight_decay=0.00004,
                           depthwise_weights_initializer_stddev=0.33,
                           pointwise_weights_initializer_stddev=0.06,
                           scope=None):
  """Performs a separable conv2d as distinct depthwise and pointwise steps.

  Unlike `tf.layers.separable_conv2d`, this operation applies an activation
  function between the depthwise and the pointwise convolutions.

  Args:
    inputs: Input tensor with shape [batch, height, width, channels].
    filters: Number of filters in the 1x1 pointwise convolution.
    kernel_size: A list of length 2: [kernel_height, kernel_width] of the
      filters. Can be an int if both values are the same.
    rate: Atrous convolution rate for the depthwise convolution.
    weight_decay: The weight decay to use for regularizing the model.
    depthwise_weights_initializer_stddev: The standard deviation of the
      truncated normal weight initializer for depthwise convolution.
    pointwise_weights_initializer_stddev: The standard deviation of the
      truncated normal weight initializer for pointwise convolution.
    scope: Optional scope for the operation.

  Returns:
    Computed features after split separable conv2d.
  """
  # Depthwise step: per-channel spatial filtering. No weight regularizer is
  # attached here; regularization is applied on the pointwise step only.
  depthwise_features = slim.separable_conv2d(
      inputs,
      None,
      kernel_size=kernel_size,
      depth_multiplier=1,
      rate=rate,
      weights_initializer=tf.truncated_normal_initializer(
          stddev=depthwise_weights_initializer_stddev),
      weights_regularizer=None,
      scope=scope + '_depthwise')
  # Pointwise step: 1x1 convolution mixing channels, with L2 regularization.
  pointwise_features = slim.conv2d(
      depthwise_features,
      filters,
      1,
      weights_initializer=tf.truncated_normal_initializer(
          stddev=pointwise_weights_initializer_stddev),
      weights_regularizer=slim.l2_regularizer(weight_decay),
      scope=scope + '_pointwise')
  return pointwise_features
......@@ -23,11 +23,6 @@ from deeplab import model
class DeeplabModelTest(tf.test.TestCase):
def testScaleDimensionOutput(self):
  """Checks scale_dimension on a few representative scales of 321."""
  # (expected scaled dimension, scale factor) pairs for an input of 321.
  cases = [(161, 0.5), (193, 0.6), (241, 0.75)]
  for expected, scale in cases:
    self.assertEqual(expected, model.scale_dimension(321, scale))
def testWrongDeepLabVariant(self):
model_options = common.ModelOptions([])._replace(
model_variant='no_such_variant')
......@@ -115,6 +110,37 @@ class DeeplabModelTest(tf.test.TestCase):
for logits in scales_to_logits.values():
self.assertTrue(logits.any())
def testBuildDeepLabWithDensePredictionCell(self):
  """Builds DeepLab configured with a dense prediction cell.

  Verifies that `multi_scale_logits` yields exactly the `merged_logits`
  endpoint when a dense prediction cell config replaces the default
  ASPP module.
  """
  batch_size = 1
  crop_size = [33, 33]
  outputs_to_num_classes = {'semantic': 2}
  expected_endpoints = ['merged_logits']
  # A minimal two-branch cell: branch 1 reads the backbone features (-1),
  # branch 2 consumes the output of branch 0.
  dense_prediction_cell_config = [
      {'kernel': 3, 'rate': [1, 6], 'op': 'conv', 'input': -1},
      {'kernel': 3, 'rate': [18, 15], 'op': 'conv', 'input': 0},
  ]
  model_options = common.ModelOptions(
      outputs_to_num_classes,
      crop_size,
      output_stride=16)._replace(
          aspp_with_batch_norm=True,
          model_variant='mobilenet_v2',
          dense_prediction_cell_config=dense_prediction_cell_config)
  g = tf.Graph()
  with g.as_default():
    with self.test_session(graph=g):
      inputs = tf.random_uniform(
          (batch_size, crop_size[0], crop_size[1], 3))
      outputs_to_scales_to_model_results = model.multi_scale_logits(
          inputs,
          model_options,
          image_pyramid=[1.0])
      for output in outputs_to_num_classes:
        scales_to_model_results = outputs_to_scales_to_model_results[output]
        # dict.keys() is a view object in Python 3, so convert to a list
        # for assertListEqual to work under both Python 2 and 3.
        self.assertListEqual(list(scales_to_model_results.keys()),
                             expected_endpoints)
        self.assertEqual(len(scales_to_model_results), 1)
if __name__ == '__main__':
tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment