Commit f798e4b5 authored by aquariusjay, committed by GitHub

Merge pull request #4311 from YknZhu/master

PiperOrigin-RevId: 197225788
parents 9dec261e afb2a7dc
@@ -104,14 +104,33 @@ Misc:
To get help with issues you may encounter while using the DeepLab Tensorflow
implementation, create a new question on
[StackOverflow](https://stackoverflow.com/) with the tag "tensorflow".

Please report bugs (i.e., broken code, not usage questions) to the
tensorflow/models GitHub [issue
tracker](https://github.com/tensorflow/models/issues), prefixing the issue name
with "deeplab".
## Change Logs
### May 18, 2018
1. Added builders for ResNet-v1 and Xception model variants.
1. Added ADE20K support, including colormap and pretrained Xception_65 checkpoint.
1. Fixed a bug when using non-default depth_multiplier for MobileNet-v2.
### March 22, 2018
Released checkpoints using MobileNet-V2 as network backbone and pretrained on
PASCAL VOC 2012 and Cityscapes.
### March 5, 2018
First release of DeepLab in TensorFlow including deeper Xception network
backbone. Included checkpoints that have been pretrained on PASCAL VOC 2012
and Cityscapes.
## References

1. **Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected CRFs**<br />
...
@@ -95,6 +95,7 @@ ORIGINAL_IMAGE = 'original_image'
# Test set name.
TEST_SET = 'test'

class ModelOptions(
    collections.namedtuple('ModelOptions', [
        'outputs_to_num_classes',
@@ -109,7 +110,8 @@ class ModelOptions(
        'decoder_output_stride',
        'decoder_use_separable_conv',
        'logits_kernel_size',
        'model_variant',
        'depth_multiplier',
    ])):
  """Immutable class to hold model options."""
@@ -139,4 +141,4 @@ class ModelOptions(
        FLAGS.aspp_with_batch_norm, FLAGS.aspp_with_separable_conv,
        FLAGS.multi_grid, FLAGS.decoder_output_stride,
        FLAGS.decoder_use_separable_conv, FLAGS.logits_kernel_size,
        FLAGS.model_variant, FLAGS.depth_multiplier)
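For context, a minimal sketch of constructing the extended `ModelOptions` tuple with the new `depth_multiplier` field (the field list is abbreviated and the values are illustrative, not part of this diff):

```python
import collections

# Abbreviated stand-in for the full field list shown in the diff above.
ModelOptions = collections.namedtuple(
    'ModelOptions',
    ['outputs_to_num_classes', 'model_variant', 'depth_multiplier'])

options = ModelOptions(
    outputs_to_num_classes={'semantic': 21},  # e.g., PASCAL VOC 2012
    model_variant='mobilenet_v2',
    depth_multiplier=0.5)  # a thinner MobileNet-v2 backbone
assert options.depth_multiplier == 0.5
```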
@@ -17,8 +17,9 @@
import functools
import tensorflow as tf

from deeplab.core import resnet_v1_beta
from deeplab.core import xception
from tensorflow.contrib.slim.nets import resnet_utils
from nets.mobilenet import mobilenet_v2
@@ -56,10 +57,12 @@ def _mobilenet_v2(net,
  """
  with tf.variable_scope(
      scope, 'MobilenetV2', [net], reuse=reuse) as scope:
    return mobilenet_v2.mobilenet_base(
        net,
        conv_defs=mobilenet_v2.V2_DEF,
        depth_multiplier=depth_multiplier,
        min_depth=8 if depth_multiplier == 1.0 else 1,
        divisible_by=8 if depth_multiplier == 1.0 else 1,
        final_endpoint=final_endpoint or _MOBILENET_V2_FINAL_ENDPOINT,
        output_stride=output_stride,
        scope=scope)
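For intuition about the new `min_depth`/`divisible_by` arguments, here is a small sketch of the usual slim-style depth rounding they control (the helper below is illustrative; only the argument names come from the diff):

```python
def scaled_depth(depth, multiplier, min_depth, divisible_by):
  """Scale a layer depth, then round to a multiple of divisible_by."""
  scaled = int(depth * multiplier + divisible_by / 2)
  return max(min_depth, scaled // divisible_by * divisible_by)

print(scaled_depth(32, 1.0, min_depth=8, divisible_by=8))  # 32
print(scaled_depth(32, 0.5, min_depth=1, divisible_by=1))  # 16
```

With a non-default multiplier, forcing `min_depth=1` and `divisible_by=1` keeps the scaled depths exact instead of snapping them back up to multiples of 8, which is the bug the change log mentions.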
@@ -68,13 +71,25 @@ def _mobilenet_v2(net,
# A map from network name to network function.
networks_map = {
    'mobilenet_v2': _mobilenet_v2,
    'resnet_v1_50': resnet_v1_beta.resnet_v1_50,
    'resnet_v1_50_beta': resnet_v1_beta.resnet_v1_50_beta,
    'resnet_v1_101': resnet_v1_beta.resnet_v1_101,
    'resnet_v1_101_beta': resnet_v1_beta.resnet_v1_101_beta,
    'xception_41': xception.xception_41,
    'xception_65': xception.xception_65,
    'xception_71': xception.xception_71,
}

# A map from network name to network arg scope.
arg_scopes_map = {
    'mobilenet_v2': mobilenet_v2.training_scope,
    'resnet_v1_50': resnet_utils.resnet_arg_scope,
    'resnet_v1_50_beta': resnet_utils.resnet_arg_scope,
    'resnet_v1_101': resnet_utils.resnet_arg_scope,
    'resnet_v1_101_beta': resnet_utils.resnet_arg_scope,
    'xception_41': xception.xception_arg_scope,
    'xception_65': xception.xception_arg_scope,
    'xception_71': xception.xception_arg_scope,
}
# Names for end point features.
@@ -86,19 +101,49 @@ networks_to_feature_maps = {
        # The provided checkpoint does not include decoder module.
        DECODER_END_POINTS: None,
    },
    'resnet_v1_50': {
        DECODER_END_POINTS: ['block1/unit_2/bottleneck_v1/conv3'],
    },
    'resnet_v1_50_beta': {
        DECODER_END_POINTS: ['block1/unit_2/bottleneck_v1/conv3'],
    },
    'resnet_v1_101': {
        DECODER_END_POINTS: ['block1/unit_2/bottleneck_v1/conv3'],
    },
    'resnet_v1_101_beta': {
        DECODER_END_POINTS: ['block1/unit_2/bottleneck_v1/conv3'],
    },
    'xception_41': {
        DECODER_END_POINTS: [
            'entry_flow/block2/unit_1/xception_module/'
            'separable_conv2_pointwise',
        ],
    },
    'xception_65': {
        DECODER_END_POINTS: [
            'entry_flow/block2/unit_1/xception_module/'
            'separable_conv2_pointwise',
        ],
    },
    'xception_71': {
        DECODER_END_POINTS: [
            'entry_flow/block2/unit_1/xception_module/'
            'separable_conv2_pointwise',
        ],
    },
}
# A map from feature extractor name to the network name scope used in the
# ImageNet pretrained versions of these models.
name_scope = {
    'mobilenet_v2': 'MobilenetV2',
    'resnet_v1_50': 'resnet_v1_50',
    'resnet_v1_50_beta': 'resnet_v1_50',
    'resnet_v1_101': 'resnet_v1_101',
    'resnet_v1_101_beta': 'resnet_v1_101',
    'xception_41': 'xception_41',
    'xception_65': 'xception_65',
    'xception_71': 'xception_71',
}
# Mean pixel value.
@@ -118,7 +163,13 @@ def _preprocess_zero_mean_unit_range(inputs):
_PREPROCESS_FN = {
    'mobilenet_v2': _preprocess_zero_mean_unit_range,
    'resnet_v1_50': _preprocess_subtract_imagenet_mean,
    'resnet_v1_50_beta': _preprocess_zero_mean_unit_range,
    'resnet_v1_101': _preprocess_subtract_imagenet_mean,
    'resnet_v1_101_beta': _preprocess_zero_mean_unit_range,
    'xception_41': _preprocess_zero_mean_unit_range,
    'xception_65': _preprocess_zero_mean_unit_range,
    'xception_71': _preprocess_zero_mean_unit_range,
}
@@ -140,7 +191,8 @@ def mean_pixel(model_variant=None):
  Returns:
    Mean pixel value.
  """
  if model_variant in ['resnet_v1_50',
                       'resnet_v1_101'] or model_variant is None:
    return _MEAN_RGB
  else:
    return [127.5, 127.5, 127.5]
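The two preprocessing conventions behind `mean_pixel` can be sketched as follows (the function names are illustrative; the mean values mirror the `_MEAN_RGB` constant defined in this file):

```python
import tensorflow as tf

_MEAN_RGB = [123.15, 115.90, 103.06]  # ImageNet mean used by vanilla ResNets

def subtract_imagenet_mean(inputs):
  # resnet_v1_50/resnet_v1_101 checkpoints expect mean-subtracted inputs.
  return tf.to_float(inputs) - tf.reshape(_MEAN_RGB, [1, 1, 1, 3])

def zero_mean_unit_range(inputs):
  # The other backbones map [0, 255] to [-1, 1]; the corresponding
  # mean pixel is therefore 127.5.
  return (2.0 / 255.0) * tf.to_float(inputs) - 1.0
```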
@@ -159,7 +211,8 @@ def extract_features(images,
                     regularize_depthwise=False,
                     preprocess_images=True,
                     num_classes=None,
                     global_pool=False,
                     use_bounded_activations=False):
  """Extracts features by the particular model_variant.

  Args:
@@ -184,6 +237,8 @@ def extract_features(images,
      to None for dense prediction tasks.
    global_pool: Global pooling for image classification task. Defaults to
      False, since dense prediction tasks do not use this.
    use_bounded_activations: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.

  Returns:
    features: A tensor of size [batch, feature_height, feature_width,
@@ -195,7 +250,25 @@ def extract_features(images,
  Raises:
    ValueError: Unrecognized model variant.
  """
  if 'resnet' in model_variant:
    arg_scope = arg_scopes_map[model_variant](
        weight_decay=weight_decay,
        batch_norm_decay=0.95,
        batch_norm_epsilon=1e-5,
        batch_norm_scale=True,
        activation_fn=tf.nn.relu6 if use_bounded_activations else tf.nn.relu)
    features, end_points = get_network(
        model_variant, preprocess_images, arg_scope)(
            inputs=images,
            num_classes=num_classes,
            is_training=(is_training and fine_tune_batch_norm),
            global_pool=global_pool,
            output_stride=output_stride,
            multi_grid=multi_grid,
            reuse=reuse,
            scope=name_scope[model_variant],
            use_bounded_activations=use_bounded_activations)
  elif 'xception' in model_variant:
    arg_scope = arg_scopes_map[model_variant](
        weight_decay=weight_decay,
        batch_norm_decay=0.9997,
...
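A hedged usage sketch for the new ResNet branch of `extract_features` above (keyword names follow the signature in the diff; the shapes are illustrative):

```python
import tensorflow as tf
from deeplab.core import feature_extractor

images = tf.placeholder(tf.float32, [None, 513, 513, 3])
features, end_points = feature_extractor.extract_features(
    images,
    output_stride=16,
    model_variant='resnet_v1_101_beta',
    use_bounded_activations=False)
# For 513x513 inputs and output_stride=16, features is [None, 33, 33, 2048].
```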
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resnet v1 model variants.
Code branched out from slim/nets/resnet_v1.py, and please refer to it for
more details.
The original version ResNets-v1 were proposed by:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from tensorflow.contrib.slim.nets import resnet_utils
slim = tf.contrib.slim
_DEFAULT_MULTI_GRID = [1, 1, 1]
@slim.add_arg_scope
def bottleneck(inputs,
               depth,
               depth_bottleneck,
               stride,
               unit_rate=1,
               rate=1,
               outputs_collections=None,
               scope=None,
               use_bounded_activations=True):
  """Bottleneck residual unit variant with BN after convolutions.

  This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
  its definition. Note that we use here the bottleneck variant which has an
  extra bottleneck layer.

  When putting together two consecutive ResNet blocks that use this unit, one
  should use stride = 2 in the last unit of the first block.

  Args:
    inputs: A tensor of size [batch, height, width, channels].
    depth: The depth of the ResNet unit output.
    depth_bottleneck: The depth of the bottleneck layers.
    stride: The ResNet unit's stride. Determines the amount of downsampling of
      the units output compared to its input.
    unit_rate: An integer, unit rate for atrous convolution.
    rate: An integer, rate for atrous convolution.
    outputs_collections: Collection to add the ResNet unit output.
    scope: Optional variable_scope.
    use_bounded_activations: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.

  Returns:
    The ResNet unit's output.
  """
  with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
    depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
    if depth == depth_in:
      shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
    else:
      shortcut = slim.conv2d(
          inputs,
          depth,
          [1, 1],
          stride=stride,
          activation_fn=tf.nn.relu6 if use_bounded_activations else None,
          scope='shortcut')
    residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
                           scope='conv1')
    residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride,
                                        rate=rate*unit_rate, scope='conv2')
    residual = slim.conv2d(residual, depth, [1, 1], stride=1,
                           activation_fn=None, scope='conv3')

    if use_bounded_activations:
      # Use clip_by_value to simulate bandpass activation.
      residual = tf.clip_by_value(residual, -6.0, 6.0)
      output = tf.nn.relu6(shortcut + residual)
    else:
      output = tf.nn.relu(shortcut + residual)

    return slim.utils.collect_named_outputs(outputs_collections,
                                            sc.name,
                                            output)
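As a quick numeric illustration of the bounded-activation path above: keeping values in a fixed [-6, 6] range lets a fixed-point quantizer assume a known scale instead of calibrating per-layer ranges (this snippet is illustrative, not part of the file):

```python
import tensorflow as tf

x = tf.constant([-8.0, -1.0, 3.0, 7.5])
bounded = tf.nn.relu6(x)                  # -> [0., 0., 3., 6.]
clipped = tf.clip_by_value(x, -6.0, 6.0)  # -> [-6., -1., 3., 6.]
with tf.Session() as sess:
  print(sess.run([bounded, clipped]))
```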
def root_block_fn_for_beta_variant(net):
  """Gets root_block_fn for beta variant.

  ResNet-v1 beta variant modifies the first original 7x7 convolution to three
  3x3 convolutions.

  Args:
    net: A tensor of size [batch, height, width, channels], input to the model.

  Returns:
    A tensor after three 3x3 convolutions.
  """
  net = resnet_utils.conv2d_same(net, 64, 3, stride=2, scope='conv1_1')
  net = resnet_utils.conv2d_same(net, 64, 3, stride=1, scope='conv1_2')
  net = resnet_utils.conv2d_same(net, 128, 3, stride=1, scope='conv1_3')
  return net
def resnet_v1_beta(inputs,
                   blocks,
                   num_classes=None,
                   is_training=None,
                   global_pool=True,
                   output_stride=None,
                   root_block_fn=None,
                   store_non_strided_activations=False,
                   use_bounded_activations=False,
                   reuse=None,
                   scope=None):
  """Generator for v1 ResNet models (beta variant).

  This function generates a family of modified ResNet v1 models. In particular,
  the first original 7x7 convolution is replaced with three 3x3 convolutions.
  See the resnet_v1_*() methods for specific model instantiations, obtained by
  selecting different block instantiations that produce ResNets of various
  depths.

  The code is modified from slim/nets/resnet_v1.py, and please refer to it for
  more details.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    blocks: A list of length equal to the number of ResNet blocks. Each element
      is a resnet_utils.Block object describing the units in the block.
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: Enable/disable is_training for batch normalization.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    root_block_fn: The function consisting of convolution operations applied to
      the root input. If root_block_fn is None, use the original setting of
      ResNet-v1, which is simply one convolution with 7x7 kernel and stride=2.
    store_non_strided_activations: If True, we compute non-strided (undecimated)
      activations at the last unit of each block and store them in the
      `outputs_collections` before subsampling them. This gives us access to
      higher resolution intermediate activations which are useful in some
      dense prediction problems but increases the computation and memory cost
      by 4x at the last unit of each block.
    use_bounded_activations: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  if root_block_fn is None:
    root_block_fn = functools.partial(resnet_utils.conv2d_same,
                                      num_outputs=64,
                                      kernel_size=7,
                                      stride=2,
                                      scope='conv1')
  with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with slim.arg_scope([slim.conv2d, bottleneck,
                         resnet_utils.stack_blocks_dense],
                        outputs_collections=end_points_collection):
      with slim.arg_scope(
          [bottleneck], use_bounded_activations=use_bounded_activations):
        if is_training is not None:
          arg_scope = slim.arg_scope([slim.batch_norm],
                                     is_training=is_training)
        else:
          arg_scope = slim.arg_scope([])
        with arg_scope:
          net = inputs
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            output_stride /= 4
          net = root_block_fn(net)
          net = slim.max_pool2d(net, 3, stride=2, padding='SAME', scope='pool1')
          net = resnet_utils.stack_blocks_dense(net, blocks, output_stride,
                                                store_non_strided_activations)

          if global_pool:
            # Global average pooling.
            net = tf.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
          if num_classes is not None:
            net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                              normalizer_fn=None, scope='logits')
          # Convert end_points_collection into a dictionary of end_points.
          end_points = slim.utils.convert_collection_to_dict(
              end_points_collection)
          if num_classes is not None:
            end_points['predictions'] = slim.softmax(net, scope='predictions')
          return net, end_points
def resnet_v1_beta_block(scope, base_depth, num_units, stride):
  """Helper function for creating a resnet_v1 beta variant bottleneck block.

  Args:
    scope: The scope of the block.
    base_depth: The depth of the bottleneck layer for each unit.
    num_units: The number of units in the block.
    stride: The stride of the block, implemented as a stride in the last unit.
      All other units have stride=1.

  Returns:
    A resnet_v1 bottleneck block.
  """
  return resnet_utils.Block(scope, bottleneck, [{
      'depth': base_depth * 4,
      'depth_bottleneck': base_depth,
      'stride': 1,
      'unit_rate': 1
  }] * (num_units - 1) + [{
      'depth': base_depth * 4,
      'depth_bottleneck': base_depth,
      'stride': stride,
      'unit_rate': 1
  }])
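For reference, a sketch of what one helper call expands to (derived directly from the code above; `args` is the third field of `resnet_utils.Block`):

```python
block = resnet_v1_beta_block('block1', base_depth=64, num_units=3, stride=2)
# block.args == [
#     {'depth': 256, 'depth_bottleneck': 64, 'stride': 1, 'unit_rate': 1},
#     {'depth': 256, 'depth_bottleneck': 64, 'stride': 1, 'unit_rate': 1},
#     {'depth': 256, 'depth_bottleneck': 64, 'stride': 2, 'unit_rate': 1},
# ]  # only the last unit strides, matching the docstring
```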
def resnet_v1_50(inputs,
                 num_classes=None,
                 is_training=None,
                 global_pool=False,
                 output_stride=None,
                 store_non_strided_activations=False,
                 multi_grid=None,
                 use_bounded_activations=False,
                 reuse=None,
                 scope='resnet_v1_50'):
  """Resnet v1 50.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: Enable/disable is_training for batch normalization.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    store_non_strided_activations: If True, we compute non-strided (undecimated)
      activations at the last unit of each block and store them in the
      `outputs_collections` before subsampling them. This gives us access to
      higher resolution intermediate activations which are useful in some
      dense prediction problems but increases the computation and memory cost
      by 4x at the last unit of each block.
    multi_grid: Employ a hierarchy of different atrous rates within network.
    use_bounded_activations: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: if multi_grid is not None and does not have length = 3.
  """
  if multi_grid is None:
    multi_grid = _DEFAULT_MULTI_GRID
  else:
    if len(multi_grid) != 3:
      raise ValueError('Expect multi_grid to have length 3.')

  blocks = [
      resnet_v1_beta_block(
          'block1', base_depth=64, num_units=3, stride=2),
      resnet_v1_beta_block(
          'block2', base_depth=128, num_units=4, stride=2),
      resnet_v1_beta_block(
          'block3', base_depth=256, num_units=6, stride=2),
      resnet_utils.Block('block4', bottleneck, [
          {'depth': 2048,
           'depth_bottleneck': 512,
           'stride': 1,
           'unit_rate': rate} for rate in multi_grid]),
  ]
  return resnet_v1_beta(
      inputs,
      blocks=blocks,
      num_classes=num_classes,
      is_training=is_training,
      global_pool=global_pool,
      output_stride=output_stride,
      store_non_strided_activations=store_non_strided_activations,
      reuse=reuse,
      scope=scope,
      use_bounded_activations=use_bounded_activations)
def resnet_v1_50_beta(inputs,
                      num_classes=None,
                      is_training=None,
                      global_pool=False,
                      output_stride=None,
                      store_non_strided_activations=False,
                      multi_grid=None,
                      use_bounded_activations=False,
                      reuse=None,
                      scope='resnet_v1_50'):
  """Resnet v1 50 beta variant.

  This variant modifies the first convolution layer of ResNet-v1-50. In
  particular, it changes the original one 7x7 convolution to three 3x3
  convolutions.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: Enable/disable is_training for batch normalization.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    store_non_strided_activations: If True, we compute non-strided (undecimated)
      activations at the last unit of each block and store them in the
      `outputs_collections` before subsampling them. This gives us access to
      higher resolution intermediate activations which are useful in some
      dense prediction problems but increases the computation and memory cost
      by 4x at the last unit of each block.
    multi_grid: Employ a hierarchy of different atrous rates within network.
    use_bounded_activations: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: if multi_grid is not None and does not have length = 3.
  """
  if multi_grid is None:
    multi_grid = _DEFAULT_MULTI_GRID
  else:
    if len(multi_grid) != 3:
      raise ValueError('Expect multi_grid to have length 3.')

  blocks = [
      resnet_v1_beta_block(
          'block1', base_depth=64, num_units=3, stride=2),
      resnet_v1_beta_block(
          'block2', base_depth=128, num_units=4, stride=2),
      resnet_v1_beta_block(
          'block3', base_depth=256, num_units=6, stride=2),
      resnet_utils.Block('block4', bottleneck, [
          {'depth': 2048,
           'depth_bottleneck': 512,
           'stride': 1,
           'unit_rate': rate} for rate in multi_grid]),
  ]
  return resnet_v1_beta(
      inputs,
      blocks=blocks,
      num_classes=num_classes,
      is_training=is_training,
      global_pool=global_pool,
      output_stride=output_stride,
      root_block_fn=functools.partial(root_block_fn_for_beta_variant),
      store_non_strided_activations=store_non_strided_activations,
      reuse=reuse,
      scope=scope,
      use_bounded_activations=use_bounded_activations)
def resnet_v1_101(inputs,
                  num_classes=None,
                  is_training=None,
                  global_pool=False,
                  output_stride=None,
                  store_non_strided_activations=False,
                  multi_grid=None,
                  use_bounded_activations=False,
                  reuse=None,
                  scope='resnet_v1_101'):
  """Resnet v1 101.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: Enable/disable is_training for batch normalization.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    store_non_strided_activations: If True, we compute non-strided (undecimated)
      activations at the last unit of each block and store them in the
      `outputs_collections` before subsampling them. This gives us access to
      higher resolution intermediate activations which are useful in some
      dense prediction problems but increases the computation and memory cost
      by 4x at the last unit of each block.
    multi_grid: Employ a hierarchy of different atrous rates within network.
    use_bounded_activations: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: if multi_grid is not None and does not have length = 3.
  """
  if multi_grid is None:
    multi_grid = _DEFAULT_MULTI_GRID
  else:
    if len(multi_grid) != 3:
      raise ValueError('Expect multi_grid to have length 3.')

  blocks = [
      resnet_v1_beta_block(
          'block1', base_depth=64, num_units=3, stride=2),
      resnet_v1_beta_block(
          'block2', base_depth=128, num_units=4, stride=2),
      resnet_v1_beta_block(
          'block3', base_depth=256, num_units=23, stride=2),
      resnet_utils.Block('block4', bottleneck, [
          {'depth': 2048,
           'depth_bottleneck': 512,
           'stride': 1,
           'unit_rate': rate} for rate in multi_grid]),
  ]
  return resnet_v1_beta(
      inputs,
      blocks=blocks,
      num_classes=num_classes,
      is_training=is_training,
      global_pool=global_pool,
      output_stride=output_stride,
      store_non_strided_activations=store_non_strided_activations,
      reuse=reuse,
      scope=scope,
      use_bounded_activations=use_bounded_activations)
def resnet_v1_101_beta(inputs,
                       num_classes=None,
                       is_training=None,
                       global_pool=False,
                       output_stride=None,
                       store_non_strided_activations=False,
                       multi_grid=None,
                       use_bounded_activations=False,
                       reuse=None,
                       scope='resnet_v1_101'):
  """Resnet v1 101 beta variant.

  This variant modifies the first convolution layer of ResNet-v1-101. In
  particular, it changes the original one 7x7 convolution to three 3x3
  convolutions.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: Enable/disable is_training for batch normalization.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    store_non_strided_activations: If True, we compute non-strided (undecimated)
      activations at the last unit of each block and store them in the
      `outputs_collections` before subsampling them. This gives us access to
      higher resolution intermediate activations which are useful in some
      dense prediction problems but increases the computation and memory cost
      by 4x at the last unit of each block.
    multi_grid: Employ a hierarchy of different atrous rates within network.
    use_bounded_activations: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: if multi_grid is not None and does not have length = 3.
  """
  if multi_grid is None:
    multi_grid = _DEFAULT_MULTI_GRID
  else:
    if len(multi_grid) != 3:
      raise ValueError('Expect multi_grid to have length 3.')

  blocks = [
      resnet_v1_beta_block(
          'block1', base_depth=64, num_units=3, stride=2),
      resnet_v1_beta_block(
          'block2', base_depth=128, num_units=4, stride=2),
      resnet_v1_beta_block(
          'block3', base_depth=256, num_units=23, stride=2),
      resnet_utils.Block('block4', bottleneck, [
          {'depth': 2048,
           'depth_bottleneck': 512,
           'stride': 1,
           'unit_rate': rate} for rate in multi_grid]),
  ]
  return resnet_v1_beta(
      inputs,
      blocks=blocks,
      num_classes=num_classes,
      is_training=is_training,
      global_pool=global_pool,
      output_stride=output_stride,
      root_block_fn=functools.partial(root_block_fn_for_beta_variant),
      store_non_strided_activations=store_non_strided_activations,
      use_bounded_activations=use_bounded_activations,
      reuse=reuse,
      scope=scope)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for resnet_v1_beta module."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow as tf
from deeplab.core import resnet_v1_beta
from tensorflow.contrib.slim.nets import resnet_utils
slim = tf.contrib.slim
def create_test_input(batch, height, width, channels):
  """Create test input tensor."""
  if None in [batch, height, width, channels]:
    return tf.placeholder(tf.float32, (batch, height, width, channels))
  else:
    return tf.to_float(
        np.tile(
            np.reshape(
                np.reshape(np.arange(height), [height, 1]) +
                np.reshape(np.arange(width), [1, width]),
                [1, height, width, 1]),
            [batch, 1, 1, channels]))
class ResnetCompleteNetworkTest(tf.test.TestCase):
  """Tests with complete small ResNet v1 networks."""

  def _resnet_small(self,
                    inputs,
                    num_classes=None,
                    is_training=True,
                    global_pool=True,
                    output_stride=None,
                    store_non_strided_activations=False,
                    multi_grid=None,
                    reuse=None,
                    scope='resnet_v1_small'):
    """A shallow and thin ResNet v1 for faster tests."""
    if multi_grid is None:
      multi_grid = [1, 1, 1]
    else:
      if len(multi_grid) != 3:
        raise ValueError('Expect multi_grid to have length 3.')

    block = resnet_v1_beta.resnet_v1_beta_block
    blocks = [
        block('block1', base_depth=1, num_units=3, stride=2),
        block('block2', base_depth=2, num_units=3, stride=2),
        block('block3', base_depth=4, num_units=3, stride=2),
        resnet_utils.Block('block4', resnet_v1_beta.bottleneck, [
            {'depth': 32,
             'depth_bottleneck': 8,
             'stride': 1,
             'unit_rate': rate} for rate in multi_grid])]

    return resnet_v1_beta.resnet_v1_beta(
        inputs,
        blocks,
        num_classes=num_classes,
        is_training=is_training,
        global_pool=global_pool,
        output_stride=output_stride,
        root_block_fn=functools.partial(
            resnet_v1_beta.root_block_fn_for_beta_variant),
        store_non_strided_activations=store_non_strided_activations,
        reuse=reuse,
        scope=scope)
  def testClassificationEndPoints(self):
    global_pool = True
    num_classes = 10
    inputs = create_test_input(2, 224, 224, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      logits, end_points = self._resnet_small(inputs,
                                              num_classes,
                                              global_pool=global_pool,
                                              scope='resnet')
    self.assertTrue(logits.op.name.startswith('resnet/logits'))
    self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
    self.assertTrue('predictions' in end_points)
    self.assertListEqual(end_points['predictions'].get_shape().as_list(),
                         [2, 1, 1, num_classes])

  def testClassificationEndPointsWithMultigrid(self):
    global_pool = True
    num_classes = 10
    inputs = create_test_input(2, 224, 224, 3)
    multi_grid = [1, 2, 4]
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      logits, end_points = self._resnet_small(inputs,
                                              num_classes,
                                              global_pool=global_pool,
                                              multi_grid=multi_grid,
                                              scope='resnet')
    self.assertTrue(logits.op.name.startswith('resnet/logits'))
    self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
    self.assertTrue('predictions' in end_points)
    self.assertListEqual(end_points['predictions'].get_shape().as_list(),
                         [2, 1, 1, num_classes])
  def testClassificationShapes(self):
    global_pool = True
    num_classes = 10
    inputs = create_test_input(2, 224, 224, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(inputs,
                                         num_classes,
                                         global_pool=global_pool,
                                         scope='resnet')
      endpoint_to_shape = {
          'resnet/conv1_1': [2, 112, 112, 64],
          'resnet/conv1_2': [2, 112, 112, 64],
          'resnet/conv1_3': [2, 112, 112, 128],
          'resnet/block1': [2, 28, 28, 4],
          'resnet/block2': [2, 14, 14, 8],
          'resnet/block3': [2, 7, 7, 16],
          'resnet/block4': [2, 7, 7, 32]}
      # Use items() rather than Python-2-only iteritems().
      for endpoint, shape in endpoint_to_shape.items():
        self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape)
  def testFullyConvolutionalEndpointShapes(self):
    global_pool = False
    num_classes = 10
    inputs = create_test_input(2, 321, 321, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(inputs,
                                         num_classes,
                                         global_pool=global_pool,
                                         scope='resnet')
      endpoint_to_shape = {
          'resnet/conv1_1': [2, 161, 161, 64],
          'resnet/conv1_2': [2, 161, 161, 64],
          'resnet/conv1_3': [2, 161, 161, 128],
          'resnet/block1': [2, 41, 41, 4],
          'resnet/block2': [2, 21, 21, 8],
          'resnet/block3': [2, 11, 11, 16],
          'resnet/block4': [2, 11, 11, 32]}
      for endpoint, shape in endpoint_to_shape.items():
        self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape)
  def testAtrousFullyConvolutionalEndpointShapes(self):
    global_pool = False
    num_classes = 10
    output_stride = 8
    inputs = create_test_input(2, 321, 321, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(inputs,
                                         num_classes,
                                         global_pool=global_pool,
                                         output_stride=output_stride,
                                         scope='resnet')
      endpoint_to_shape = {
          'resnet/conv1_1': [2, 161, 161, 64],
          'resnet/conv1_2': [2, 161, 161, 64],
          'resnet/conv1_3': [2, 161, 161, 128],
          'resnet/block1': [2, 41, 41, 4],
          'resnet/block2': [2, 41, 41, 8],
          'resnet/block3': [2, 41, 41, 16],
          'resnet/block4': [2, 41, 41, 32]}
      for endpoint, shape in endpoint_to_shape.items():
        self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape)
  def testAtrousFullyConvolutionalValues(self):
    """Verify dense feature extraction with atrous convolution."""
    nominal_stride = 32
    for output_stride in [4, 8, 16, 32, None]:
      with slim.arg_scope(resnet_utils.resnet_arg_scope()):
        with tf.Graph().as_default():
          with self.test_session() as sess:
            tf.set_random_seed(0)
            inputs = create_test_input(2, 81, 81, 3)
            # Dense feature extraction followed by subsampling.
            output, _ = self._resnet_small(inputs,
                                           None,
                                           is_training=False,
                                           global_pool=False,
                                           output_stride=output_stride)
            if output_stride is None:
              factor = 1
            else:
              factor = nominal_stride // output_stride
            output = resnet_utils.subsample(output, factor)
            # Make the two networks use the same weights.
            tf.get_variable_scope().reuse_variables()
            # Feature extraction at the nominal network rate.
            expected, _ = self._resnet_small(inputs,
                                             None,
                                             is_training=False,
                                             global_pool=False)
            sess.run(tf.global_variables_initializer())
            self.assertAllClose(output.eval(), expected.eval(),
                                atol=1e-4, rtol=1e-4)
  def testUnknownBatchSize(self):
    batch = 2
    height, width = 65, 65
    global_pool = True
    num_classes = 10
    inputs = create_test_input(None, height, width, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      logits, _ = self._resnet_small(inputs,
                                     num_classes,
                                     global_pool=global_pool,
                                     scope='resnet')
    self.assertTrue(logits.op.name.startswith('resnet/logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [None, 1, 1, num_classes])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(logits, {inputs: images.eval()})
      self.assertEquals(output.shape, (batch, 1, 1, num_classes))
  def testFullyConvolutionalUnknownHeightWidth(self):
    batch = 2
    height, width = 65, 65
    global_pool = False
    inputs = create_test_input(batch, None, None, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      output, _ = self._resnet_small(inputs,
                                     None,
                                     global_pool=global_pool)
    self.assertListEqual(output.get_shape().as_list(),
                         [batch, None, None, 32])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(output, {inputs: images.eval()})
      self.assertEquals(output.shape, (batch, 3, 3, 32))
  def testAtrousFullyConvolutionalUnknownHeightWidth(self):
    batch = 2
    height, width = 65, 65
    global_pool = False
    output_stride = 8
    inputs = create_test_input(batch, None, None, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      output, _ = self._resnet_small(inputs,
                                     None,
                                     global_pool=global_pool,
                                     output_stride=output_stride)
    self.assertListEqual(output.get_shape().as_list(),
                         [batch, None, None, 32])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(output, {inputs: images.eval()})
      self.assertEquals(output.shape, (batch, 9, 9, 32))
if __name__ == '__main__':
  tf.test.main()
@@ -493,6 +493,73 @@ def xception_block(scope,
  }] * num_units)
def xception_41(inputs,
                num_classes=None,
                is_training=True,
                global_pool=True,
                keep_prob=0.5,
                output_stride=None,
                regularize_depthwise=False,
                multi_grid=None,
                reuse=None,
                scope='xception_41'):
  """Xception-41 model."""
  blocks = [
      xception_block('entry_flow/block1',
                     depth_list=[128, 128, 128],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('entry_flow/block2',
                     depth_list=[256, 256, 256],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('entry_flow/block3',
                     depth_list=[728, 728, 728],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('middle_flow/block1',
                     depth_list=[728, 728, 728],
                     skip_connection_type='sum',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=8,
                     stride=1),
      xception_block('exit_flow/block1',
                     depth_list=[728, 1024, 1024],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('exit_flow/block2',
                     depth_list=[1536, 1536, 2048],
                     skip_connection_type='none',
                     activation_fn_in_separable_conv=True,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=1,
                     unit_rate_list=multi_grid),
  ]
  return xception(inputs,
                  blocks=blocks,
                  num_classes=num_classes,
                  is_training=is_training,
                  global_pool=global_pool,
                  keep_prob=keep_prob,
                  output_stride=output_stride,
                  reuse=reuse,
                  scope=scope)
def xception_65(inputs,
                num_classes=None,
                is_training=True,
@@ -560,6 +627,87 @@ def xception_65(inputs,
                  scope=scope)
def xception_71(inputs,
                num_classes=None,
                is_training=True,
                global_pool=True,
                keep_prob=0.5,
                output_stride=None,
                regularize_depthwise=False,
                multi_grid=None,
                reuse=None,
                scope='xception_71'):
  """Xception-71 model."""
  blocks = [
      xception_block('entry_flow/block1',
                     depth_list=[128, 128, 128],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('entry_flow/block2',
                     depth_list=[256, 256, 256],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=1),
      xception_block('entry_flow/block3',
                     depth_list=[256, 256, 256],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('entry_flow/block4',
                     depth_list=[728, 728, 728],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=1),
      xception_block('entry_flow/block5',
                     depth_list=[728, 728, 728],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('middle_flow/block1',
                     depth_list=[728, 728, 728],
                     skip_connection_type='sum',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=16,
                     stride=1),
      xception_block('exit_flow/block1',
                     depth_list=[728, 1024, 1024],
                     skip_connection_type='conv',
                     activation_fn_in_separable_conv=False,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=2),
      xception_block('exit_flow/block2',
                     depth_list=[1536, 1536, 2048],
                     skip_connection_type='none',
                     activation_fn_in_separable_conv=True,
                     regularize_depthwise=regularize_depthwise,
                     num_units=1,
                     stride=1,
                     unit_rate_list=multi_grid),
  ]
  return xception(inputs,
                  blocks=blocks,
                  num_classes=num_classes,
                  is_training=is_training,
                  global_pool=global_pool,
                  keep_prob=keep_prob,
                  output_stride=output_stride,
                  reuse=reuse,
                  scope=scope)
def xception_arg_scope(weight_decay=0.00004,
                       batch_norm_decay=0.9997,
                       batch_norm_epsilon=0.001,
...
@@ -14,8 +14,8 @@
# ==============================================================================
"""Tests for xception.py."""

import numpy as np
import six
import tensorflow as tf

from deeplab.core import xception
...
@@ -13,10 +13,11 @@
# limitations under the License.
# ==============================================================================
"""Converts ADE20K data to TFRecord file format with Example protos."""

import math
import os
import random
import sys

import build_data
import tensorflow as tf
@@ -44,12 +45,13 @@ tf.app.flags.DEFINE_string(
tf.app.flags.DEFINE_string(
    'output_dir', './ADE20K/tfrecord',
    'Path to save converted tfrecord of Tensorflow example')

_NUM_SHARDS = 4


def _convert_dataset(dataset_split, dataset_dir, dataset_label_dir):
  """Converts the ADE20k dataset into tfrecord format.

  Args:
    dataset_split: Dataset split (e.g., train, val).
@@ -65,7 +67,7 @@ def _convert_dataset(dataset_split, dataset_dir, dataset_label_dir):
  seg_names = []
  for f in img_names:
    # get the filename without the extension
    basename = os.path.basename(f).split('.')[0]
    # cover its corresponding *_seg.png
    seg = os.path.join(dataset_label_dir, basename+'.png')
    seg_names.append(seg)
@@ -104,10 +106,13 @@ def _convert_dataset(dataset_split, dataset_dir, dataset_label_dir):
    sys.stdout.write('\n')
    sys.stdout.flush()


def main(unused_argv):
  tf.gfile.MakeDirs(FLAGS.output_dir)
  _convert_dataset(
      'train', FLAGS.train_image_folder, FLAGS.train_image_label_folder)
  _convert_dataset('val', FLAGS.val_image_folder, FLAGS.val_image_label_folder)


if __name__ == '__main__':
  tf.app.run()
@@ -127,9 +127,10 @@ def _bytes_list_feature(values):
    A TF-Feature.
  """
  def norm2bytes(value):
    return value.encode() if isinstance(value, str) and six.PY3 else value

  return tf.train.Feature(
      bytes_list=tf.train.BytesList(value=[norm2bytes(values)]))


def image_seg_to_tfexample(image_data, filename, height, width, seg_data):
...
@@ -24,7 +24,7 @@
#  - build_data.py
#  - build_ade20k_data.py
#  - download_and_convert_ade20k.sh
#  + ADE20K
#    + tfrecord
#    + ADEChallengeData2016
#      + annotations
...
@@ -69,7 +69,10 @@ _ITEMS_TO_DESCRIPTIONS = {
DatasetDescriptor = collections.namedtuple(
    'DatasetDescriptor',
    ['splits_to_sizes',  # Splits of the dataset into training, val, and test.
     'num_classes',      # Number of semantic classes, including the background
                         # class (if exists). For example, there are 20
                         # foreground classes + 1 background class in the
                         # PASCAL VOC 2012 dataset. Thus, we set num_classes=21.
     'ignore_label',     # Ignore label value.
    ]
)
@@ -96,12 +99,12 @@ _PASCAL_VOC_SEG_INFORMATION = DatasetDescriptor(
# These numbers (i.e., 'train'/'test') seem to have to be hard coded.
# You are required to figure it out for your training/testing example.
_ADE20K_INFORMATION = DatasetDescriptor(
    splits_to_sizes={
        'train': 20210,  # num of samples in images/training
        'val': 2000,  # num of samples in images/validation
    },
    num_classes=151,
    ignore_label=0,
)
...
@@ -17,8 +17,8 @@
See model.py for more details and usage.
"""

import math
import six
import tensorflow as tf

from deeplab import common
from deeplab import model
...
@@ -13,8 +13,7 @@ convert ADE20K semantic segmentation dataset to TFRecord.
bash download_and_convert_ade20k.sh
```

The converted dataset will be saved at ./deeplab/datasets/ADE20K/tfrecord

## Recommended Directory Structure for Training and Evaluation
@@ -23,7 +22,7 @@ The converted dataset will be saved at
- build_data.py
- build_ade20k_data.py
- download_and_convert_ade20k.sh
+ ADE20K
  + tfrecord
  + exp
    + train_on_train_set
@@ -50,7 +49,7 @@ A local training job using `xception_65` can be run with the following command:
# From tensorflow/models/research/
python deeplab/train.py \
    --logtostderr \
    --training_number_of_steps=90000 \
    --train_split="train" \
    --model_variant="xception_65" \
    --atrous_rates=6 \
@@ -61,21 +60,16 @@ python deeplab/train.py \
    --train_crop_size=513 \
    --train_crop_size=513 \
    --train_batch_size=4 \
    --min_resize_value=513 \
    --max_resize_value=513 \
    --resize_factor=16 \
    --dataset="ade20k" \
    --tf_initial_checkpoint=${PATH_TO_INITIAL_CHECKPOINT} \
    --train_logdir=${PATH_TO_TRAIN_DIR} \
    --dataset_dir=${PATH_TO_DATASET}
```

where ${PATH\_TO\_INITIAL\_CHECKPOINT} is the path to the initial checkpoint.
${PATH\_TO\_TRAIN\_DIR} is the directory in which training checkpoints and
events will be written to (it is recommended to set it to the
`train_on_train_set/train` above), and ${PATH\_TO\_DATASET} is the directory in
@@ -83,24 +77,22 @@ which the ADE20K dataset resides (the `tfrecord` above)
**Note that for train.py:**

1. In order to fine tune the BN layers, one needs to use a large batch size
   (> 12) and set fine_tune_batch_norm = True. Here, we simply use a small
   batch size during training for the purpose of demonstration. If the users
   have limited GPU memory at hand, please fine-tune from our provided
   checkpoints whose batch norm parameters have been trained, and use a
   smaller learning rate with fine_tune_batch_norm = False.

2. Users should fine-tune `min_resize_value` and `max_resize_value` to get
   better results. Note that `resize_factor` has to be equal to
   `output_stride`.

3. Users should change atrous_rates from [6, 12, 18] to [12, 24, 36] if
   setting output_stride=8.

4. Users could skip the flag `decoder_output_stride` if they do not want to
   use the decoder structure.

## Running Tensorboard

Progress for training and evaluation jobs can be inspected using Tensorboard. If
...
@@ -60,6 +60,12 @@ sh local_test_mobilenetv2.sh
First, make sure you could reproduce the results with our provided setting.
After that, you could start to make a new change one at a time to help debug.
___
Q8: What value of `eval_crop_size` should I use?

A: Our model uses whole-image inference, meaning that we need to set
`eval_crop_size` equal to `output_stride` * k + 1, where k is an integer, and
choose k so that the resulting `eval_crop_size` is slightly larger than the
largest image dimension in the dataset. For example, we use `eval_crop_size` =
513x513 for the PASCAL dataset, whose largest image dimension is 512.
Similarly, we set `eval_crop_size` = 1025x2049 for Cityscapes, whose images
are all of dimension 1024x2048.
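
A minimal sketch of this rule (a hypothetical helper, not part of the
codebase):

```python
def eval_crop_dim(image_dim, output_stride=16):
  """Returns the smallest output_stride * k + 1 that covers image_dim."""
  k = (image_dim - 1 + output_stride - 1) // output_stride  # ceiling division
  return output_stride * max(k, 1) + 1

assert eval_crop_dim(512) == 513    # PASCAL: largest image dimension is 512.
assert eval_crop_dim(1024) == 1025  # Cityscapes image height.
assert eval_crop_dim(2048) == 2049  # Cityscapes image width.
```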
___
## References
...
# TensorFlow DeepLab Model Zoo
We provide DeepLab models pretrained on several datasets, including (1) PASCAL
VOC 2012, (2) Cityscapes, and (3) ADE20K, for reproducing our results, as well
as some checkpoints that are only pretrained on ImageNet for training your own
models.
## DeepLab models trained on PASCAL VOC 2012
...@@ -69,6 +70,22 @@ Checkpoint name
[mobilenetv2_coco_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_mnv2_cityscapes_train_2018_02_05.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 21.27B <br> 433.24B | 0.8 <br> 51.12 | 70.71% (val) <br> 73.57% (val) | 23MB
[xception_cityscapes_trainfine](http://download.tensorflow.org/models/deeplabv3_cityscapes_train_2018_02_06.tar.gz) | 16 <br> 8 | [1.0] <br> [0.75:0.25:1.25] | No <br> Yes | 418.64B <br> 8677.92B | 5.0 <br> 422.8 | 78.79% (val) <br> 80.42% (val) | 439MB
## DeepLab models trained on ADE20K
### Model details
We provide some checkpoints that have been pretrained on the ADE20K training
set. Note that the model has only been pretrained on ImageNet, following the
dataset rule.
Checkpoint name | Network backbone | Pretrained dataset | ASPP | Decoder
------------------------------------- | :--------------: | :-------------------------------------: | :----------------------------------------------: | :-----:
xception_ade20k_train | Xception_65 | ImageNet <br> ADE20K training set | [6, 12, 18] for OS=16 <br> [12, 24, 36] for OS=8 | OS = 4
Checkpoint name | Eval OS | Eval scales | Left-right Flip | mIOU | Pixel-wise Accuracy | File Size
--------------------- | :-----: | :-------------: | :-------------: | :--------------: | :-----------------: | :-------:
[xception_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_14.tar.gz) | 16 | [0.5:0.25:1.75] | Yes | 43.54% (val) | 81.74% (val) | 439MB
## Checkpoints pretrained on ImageNet
Un-tar'ed directory includes:
...@@ -84,15 +101,24 @@ one could use this for training your own models.
[MobileNet-V2](https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet)
for details.
* xception_{41,65,71}: We adapt the original Xception model to the task of
  semantic segmentation with the following changes: (1) more layers, (2) all
  max pooling operations are replaced by strided (atrous) separable
  convolutions, and (3) extra batch-norm and ReLU after each 3x3 depthwise
  convolution are added. We provide three Xception model variants with
  different network depths.

* resnet_v1_{50,101}_beta: We modify the original ResNet-101 [10], similar to
  PSPNet [11], by replacing the first 7x7 convolution with three 3x3
  convolutions (a rough sketch follows this list). See resnet_v1_beta.py for
  more details.
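
As a rough sketch only (assumed scope names and padding details; the actual
implementation lives in resnet_v1_beta.py), the modified "beta" stem can be
written with tf.contrib.slim as:

```python
import tensorflow as tf

slim = tf.contrib.slim

def beta_root_block(inputs):
  """Replaces ResNet's 7x7/stride-2 stem with three 3x3 convolutions."""
  net = slim.conv2d(inputs, 64, [3, 3], stride=2, scope='conv1_1')
  net = slim.conv2d(net, 64, [3, 3], stride=1, scope='conv1_2')
  net = slim.conv2d(net, 128, [3, 3], stride=1, scope='conv1_3')
  return net
```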
Model name | File Size
-------------------------------------------------------------------------------------- | :-------:
[xception_41](http://download.tensorflow.org/models/xception_41_2018_05_09.tar.gz) | 288MB
[xception_65](http://download.tensorflow.org/models/deeplabv3_xception_2018_01_04.tar.gz) | 447MB
[xception_71](http://download.tensorflow.org/models/xception_71_2018_05_09.tar.gz) | 474MB
[resnet_v1_50_beta](http://download.tensorflow.org/models/resnet_v1_50_2018_05_04.tar.gz) | 274MB
[resnet_v1_101_beta](http://download.tensorflow.org/models/resnet_v1_101_2018_05_04.tar.gz) | 477MB
## References
...@@ -132,3 +158,16 @@ Model name
9. **ImageNet Large Scale Visual Recognition Challenge**<br />
Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein, Alexander C. Berg, Li Fei-Fei<br />
[[link]](http://www.image-net.org/). IJCV, 2015.
10. **Deep Residual Learning for Image Recognition**<br />
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun<br />
[[link]](https://arxiv.org/abs/1512.03385). In CVPR, 2016.
11. **Pyramid Scene Parsing Network**<br />
Hengshuang Zhao, Jianping Shi, Xiaojuan Qi, Xiaogang Wang, Jiaya Jia<br />
[[link]](https://arxiv.org/abs/1612.01105). In CVPR, 2017.
12. **Scene Parsing through ADE20K Dataset**<br />
Bolei Zhou, Hang Zhao, Xavier Puig, Sanja Fidler, Adela Barriuso, Antonio Torralba<br />
[[link]](http://groups.csail.mit.edu/vision/datasets/ADE20K/). In CVPR, 2017.
...@@ -64,6 +64,10 @@ _CONCAT_PROJECTION_SCOPE = 'concat_projection'
_DECODER_SCOPE = 'decoder'
def get_merged_logits_scope():
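"""Returns the scope name of the merged logits."""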
return _MERGED_LOGITS_SCOPE
def get_extra_layer_scopes(last_layers_contain_logits_only=False):
"""Gets the scopes for extra layers. """Gets the scopes for extra layers.
...@@ -358,6 +362,7 @@ def _extract_features(images,
output_stride=model_options.output_stride,
multi_grid=model_options.multi_grid,
model_variant=model_options.model_variant,
depth_multiplier=model_options.depth_multiplier,
weight_decay=weight_decay,
reuse=reuse,
is_training=is_training,
...
...@@ -111,7 +111,7 @@ class DeeplabModelTest(tf.test.TestCase):
for output in outputs_to_num_classes:
scales_to_logits = outputs_to_scales_to_logits[output]
# Expect only one output.
self.assertEqual(len(scales_to_logits), 1)
for logits in scales_to_logits.values():
self.assertTrue(logits.any())
...
...@@ -68,7 +68,8 @@ flags.DEFINE_integer('save_summaries_secs', 600,
'How often, in seconds, we compute the summaries.')
flags.DEFINE_boolean('save_summaries_images', False,
                     'Save sample inputs, labels, and semantic predictions as '
                     'images to summary.')
# Settings for training strategy.
...@@ -184,9 +185,11 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
""" """
samples = inputs_queue.dequeue() samples = inputs_queue.dequeue()
# Add name to input and label nodes so we can add to summary.
samples[common.IMAGE] = tf.identity(
    samples[common.IMAGE], name=common.IMAGE)
samples[common.LABEL] = tf.identity(
    samples[common.LABEL], name=common.LABEL)
model_options = common.ModelOptions(
outputs_to_num_classes=outputs_to_num_classes,
...@@ -201,11 +204,11 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
is_training=True,
fine_tune_batch_norm=FLAGS.fine_tune_batch_norm)
# Add name to graph node so we can add to summary.
output_type_dict = outputs_to_scales_to_logits[common.OUTPUT_TYPE]
output_type_dict[model.get_merged_logits_scope()] = tf.identity(
    output_type_dict[model.get_merged_logits_scope()],
    name=common.OUTPUT_TYPE)
for output, num_classes in six.iteritems(outputs_to_num_classes):
train_utils.add_softmax_cross_entropy_loss_for_each_scale(
...@@ -234,7 +237,7 @@ def main(unused_argv):
assert FLAGS.train_batch_size % config.num_clones == 0, (
    'Training batch size not divisible by number of clones (GPUs).')
clone_batch_size = FLAGS.train_batch_size // config.num_clones
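# Floor division keeps clone_batch_size an integer under Python 3, where
# plain / would return a float.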
# Get dataset-dependent information.
dataset = segmentation_dataset.get_dataset(
...@@ -286,19 +289,27 @@ def main(unused_argv):
# Add summaries for images, labels, semantic predictions
if FLAGS.save_summaries_images:
  summary_image = graph.get_tensor_by_name(
      ('%s/%s:0' % (first_clone_scope, common.IMAGE)).strip('/'))
  summaries.add(
      tf.summary.image('samples/%s' % common.IMAGE, summary_image))

  first_clone_label = graph.get_tensor_by_name(
      ('%s/%s:0' % (first_clone_scope, common.LABEL)).strip('/'))
  # Scale up summary image pixel values for better visualization.
  pixel_scaling = max(1, 255 // dataset.num_classes)
  summary_label = tf.cast(first_clone_label * pixel_scaling, tf.uint8)
  summaries.add(
      tf.summary.image('samples/%s' % common.LABEL, summary_label))

  first_clone_output = graph.get_tensor_by_name(
      ('%s/%s:0' % (first_clone_scope, common.OUTPUT_TYPE)).strip('/'))
  predictions = tf.expand_dims(tf.argmax(first_clone_output, 3), -1)
  summary_predictions = tf.cast(predictions * pixel_scaling, tf.uint8)
  summaries.add(
      tf.summary.image(
          'samples/%s' % common.OUTPUT_TYPE, summary_predictions))
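  # Illustrative numbers (not from the original file): with PASCAL VOC's 21
  # classes, pixel_scaling = max(1, 255 // 21) = 12, so the largest label id,
  # 20, is drawn at intensity 240 rather than a nearly invisible 20.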
# Add summaries for losses.
for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
...@@ -325,7 +336,8 @@ def main(unused_argv):
summaries.add(tf.summary.scalar('total_loss', total_loss))
# Modify the gradients for biases and last layer variables.
last_layers = model.get_extra_layer_scopes(
    FLAGS.last_layers_contain_logits_only)
grad_mult = train_utils.get_model_gradient_multipliers(
    last_layers, FLAGS.last_layer_gradient_multiplier)
if grad_mult:
...
...@@ -17,30 +17,196 @@
Visualizes the semantic segmentation results by the color map
defined by the different datasets. Supported colormaps are:

* ADE20K (http://groups.csail.mit.edu/vision/datasets/ADE20K/).

* Cityscapes dataset (https://www.cityscapes-dataset.com).

* PASCAL VOC 2012 (http://host.robots.ox.ac.uk/pascal/VOC/).
"""
import numpy as np
# Dataset names.
_ADE20K = 'ade20k'
_CITYSCAPES = 'cityscapes'
_PASCAL = 'pascal'
# Max number of entries in the colormap for each dataset.
_DATASET_MAX_ENTRIES = {
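# 151 = 150 ADE20K scene-parsing classes + 1 'background' label (id 0).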
_ADE20K: 151,
_CITYSCAPES: 19,
_PASCAL: 256,
}
def create_ade20k_label_colormap():
"""Creates a label colormap used in ADE20K segmentation benchmark.
Returns:
A colormap for visualizing segmentation results.
"""
return np.asarray([
[0, 0, 0],
[120, 120, 120],
[180, 120, 120],
[6, 230, 230],
[80, 50, 50],
[4, 200, 3],
[120, 120, 80],
[140, 140, 140],
[204, 5, 255],
[230, 230, 230],
[4, 250, 7],
[224, 5, 255],
[235, 255, 7],
[150, 5, 61],
[120, 120, 70],
[8, 255, 51],
[255, 6, 82],
[143, 255, 140],
[204, 255, 4],
[255, 51, 7],
[204, 70, 3],
[0, 102, 200],
[61, 230, 250],
[255, 6, 51],
[11, 102, 255],
[255, 7, 71],
[255, 9, 224],
[9, 7, 230],
[220, 220, 220],
[255, 9, 92],
[112, 9, 255],
[8, 255, 214],
[7, 255, 224],
[255, 184, 6],
[10, 255, 71],
[255, 41, 10],
[7, 255, 255],
[224, 255, 8],
[102, 8, 255],
[255, 61, 6],
[255, 194, 7],
[255, 122, 8],
[0, 255, 20],
[255, 8, 41],
[255, 5, 153],
[6, 51, 255],
[235, 12, 255],
[160, 150, 20],
[0, 163, 255],
[140, 140, 140],
[250, 10, 15],
[20, 255, 0],
[31, 255, 0],
[255, 31, 0],
[255, 224, 0],
[153, 255, 0],
[0, 0, 255],
[255, 71, 0],
[0, 235, 255],
[0, 173, 255],
[31, 0, 255],
[11, 200, 200],
[255, 82, 0],
[0, 255, 245],
[0, 61, 255],
[0, 255, 112],
[0, 255, 133],
[255, 0, 0],
[255, 163, 0],
[255, 102, 0],
[194, 255, 0],
[0, 143, 255],
[51, 255, 0],
[0, 82, 255],
[0, 255, 41],
[0, 255, 173],
[10, 0, 255],
[173, 255, 0],
[0, 255, 153],
[255, 92, 0],
[255, 0, 255],
[255, 0, 245],
[255, 0, 102],
[255, 173, 0],
[255, 0, 20],
[255, 184, 184],
[0, 31, 255],
[0, 255, 61],
[0, 71, 255],
[255, 0, 204],
[0, 255, 194],
[0, 255, 82],
[0, 10, 255],
[0, 112, 255],
[51, 0, 255],
[0, 194, 255],
[0, 122, 255],
[0, 255, 163],
[255, 153, 0],
[0, 255, 10],
[255, 112, 0],
[143, 255, 0],
[82, 0, 255],
[163, 255, 0],
[255, 235, 0],
[8, 184, 170],
[133, 0, 255],
[0, 255, 92],
[184, 0, 255],
[255, 0, 31],
[0, 184, 255],
[0, 214, 255],
[255, 0, 112],
[92, 255, 0],
[0, 224, 255],
[112, 224, 255],
[70, 184, 160],
[163, 0, 255],
[153, 0, 255],
[71, 255, 0],
[255, 0, 163],
[255, 204, 0],
[255, 0, 143],
[0, 255, 235],
[133, 255, 0],
[255, 0, 235],
[245, 0, 255],
[255, 0, 122],
[255, 245, 0],
[10, 190, 212],
[214, 255, 0],
[0, 204, 255],
[20, 0, 255],
[255, 255, 0],
[0, 153, 255],
[0, 41, 255],
[0, 255, 204],
[41, 0, 255],
[41, 255, 0],
[173, 0, 255],
[0, 245, 255],
[71, 0, 255],
[122, 0, 255],
[0, 255, 184],
[0, 92, 255],
[184, 255, 0],
[0, 133, 255],
[255, 214, 0],
[25, 194, 194],
[102, 255, 0],
[92, 0, 255],
])
def create_cityscapes_label_colormap():
"""Creates a label colormap used in CITYSCAPES segmentation benchmark.
Returns:
A colormap for visualizing segmentation results.
"""
return np.asarray([
[128, 64, 128],
[244, 35, 232],
[70, 70, 70],
...@@ -61,17 +227,37 @@ def create_cityscapes_label_colormap():
[0, 0, 230],
[119, 11, 32],
])
def create_pascal_label_colormap():
"""Creates a label colormap used in PASCAL VOC segmentation benchmark.
Returns:
A colormap for visualizing segmentation results.
"""
colormap = np.zeros((_DATASET_MAX_ENTRIES[_PASCAL], 3), dtype=int)
ind = np.arange(_DATASET_MAX_ENTRIES[_PASCAL], dtype=int)
for shift in reversed(range(8)):
for channel in range(3):
colormap[:, channel] |= bit_get(ind, channel) << shift
ind >>= 3
return colormap
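# Worked example (illustrative comment, not part of the original file):
# label 1 is binary 001, so only channel 0 collects bits; the first pass
# (shift=7) sets 1 << 7 = 128 and ind becomes 0, giving colormap[1] =
# [128, 0, 0]. Label 2 (binary 010) likewise maps to [0, 128, 0].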
def get_ade20k_name():
return _ADE20K
def get_cityscapes_name():
return _CITYSCAPES
def get_pascal_name():
return _PASCAL
def bit_get(val, idx):
"""Gets the bit value.
...@@ -85,23 +271,6 @@ def bit_get(val, idx):
return (val >> idx) & 1
def create_label_colormap(dataset=_PASCAL):
"""Creates a label colormap for the specified dataset.
...@@ -114,10 +283,12 @@ def create_label_colormap(dataset=_PASCAL):
Raises:
ValueError: If the dataset is not supported.
"""
if dataset == _ADE20K:
  return create_ade20k_label_colormap()
elif dataset == _CITYSCAPES:
  return create_cityscapes_label_colormap()
elif dataset == _PASCAL:
  return create_pascal_label_colormap()
else:
  raise ValueError('Unsupported dataset.')
...@@ -132,7 +303,7 @@ def label_to_color_image(label, dataset=_PASCAL):
Returns:
result: A 2D array with floating type. The element of the array
is the color indexed by the corresponding element in the input label
to the dataset color map.
Raises:
ValueError: If label is not of rank 2 or its value is larger than color
...
...@@ -70,6 +70,22 @@ class VisualizationUtilTest(tf.test.TestCase):
with self.assertRaises(ValueError):
get_dataset_colormap.create_label_colormap('unsupported_dataset')
def testUnExpectedLabelDimensionForLabelToADE20KColorImage(self):
label = np.array([250])
with self.assertRaises(ValueError):
get_dataset_colormap.label_to_color_image(
label, get_dataset_colormap.get_ade20k_name())
def testFirstColorInADE20KColorMap(self):
label = np.array([[1, 3], [10, 20]])
expected_result = np.array([
[[120, 120, 120], [6, 230, 230]],
[[4, 250, 7], [204, 70, 3]]
])
colored_label = get_dataset_colormap.label_to_color_image(
label, get_dataset_colormap.get_ade20k_name())
self.assertTrue(np.array_equal(colored_label, expected_result))
if __name__ == '__main__':
tf.test.main()