"enter/dglenter/model/edge_encoder/ele.py" did not exist on "80fb4dbe2675adfb2bd469260e20facdaae0631d"
Unverified Commit 2310bc34 authored by Yukun Zhu's avatar Yukun Zhu Committed by GitHub
Browse files

Merge pull request #4534 from huihui-personal/master

PiperOrigin-RevId: 200493322
parents 1f82c227 e2e820c1
......@@ -113,6 +113,11 @@ with "deeplab".
## Change Logs
### May 26, 2018
Updated ADE20K pretrained checkpoint.
### May 18, 2018
1. Added builders for ResNet-v1 and Xception model variants.
1. Added ADE20K support, including colormap and pretrained Xception_65 checkpoint.
......
......@@ -40,10 +40,10 @@ flags.DEFINE_integer('logits_kernel_size', 1,
'generates logits.')
# When using 'mobilenet_v2', we set atrous_rates = decoder_output_stride = None.
# When using 'xception_65', we set atrous_rates = [6, 12, 18] (output stride 16)
# and decoder_output_stride = 4.
flags.DEFINE_enum('model_variant', 'mobilenet_v2',
['xception_65', 'mobilenet_v2'], 'DeepLab model variant.')
# When using 'xception_65' or 'resnet_v1' model variants, we set
# atrous_rates = [6, 12, 18] (output stride 16) and decoder_output_stride = 4.
# See core/feature_extractor.py for supported model variants.
flags.DEFINE_string('model_variant', 'mobilenet_v2', 'DeepLab model variant.')
flags.DEFINE_multi_float('image_pyramid', None,
'Input scales for multi-scale feature extraction.')
......@@ -57,6 +57,8 @@ flags.DEFINE_boolean('aspp_with_batch_norm', True,
flags.DEFINE_boolean('aspp_with_separable_conv', True,
'Use separable convolution for ASPP or not.')
# Defaults to None. Set multi_grid = [1, 2, 4] when using provided
# 'resnet_v1_{50,101}_beta' checkpoints.
flags.DEFINE_multi_integer('multi_grid', None,
'Employ a hierarchy of atrous rates for ResNet.')
......
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for common.py."""
import tensorflow as tf
from deeplab import common
class CommonTest(tf.test.TestCase):
  """Test case covering common.py."""

  def testOutputsToNumClasses(self):
    """ModelOptions exposes the outputs_to_num_classes mapping it was given."""
    expected_num_classes = 21
    outputs_to_num_classes = {common.OUTPUT_TYPE: expected_num_classes}
    options = common.ModelOptions(
        outputs_to_num_classes=outputs_to_num_classes)
    stored_num_classes = options.outputs_to_num_classes[common.OUTPUT_TYPE]
    self.assertEqual(stored_num_classes, expected_num_classes)
if __name__ == '__main__':
tf.test.main()
......@@ -98,8 +98,7 @@ DECODER_END_POINTS = 'decoder_end_points'
# A dictionary from network name to a map of end point features.
networks_to_feature_maps = {
'mobilenet_v2': {
# The provided checkpoint does not include decoder module.
DECODER_END_POINTS: None,
DECODER_END_POINTS: ['layer_4/depthwise_output'],
},
'resnet_v1_50': {
DECODER_END_POINTS: ['block1/unit_2/bottleneck_v1/conv3'],
......@@ -211,8 +210,7 @@ def extract_features(images,
regularize_depthwise=False,
preprocess_images=True,
num_classes=None,
global_pool=False,
use_bounded_activations=False):
global_pool=False):
"""Extracts features by the particular model_variant.
Args:
......@@ -237,8 +235,6 @@ def extract_features(images,
to None for dense prediction tasks.
global_pool: Global pooling for image classification task. Defaults to
False, since dense prediction tasks do not use this.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
Returns:
features: A tensor of size [batch, feature_height, feature_width,
......@@ -255,8 +251,7 @@ def extract_features(images,
weight_decay=weight_decay,
batch_norm_decay=0.95,
batch_norm_epsilon=1e-5,
batch_norm_scale=True,
activation_fn=tf.nn.relu6 if use_bounded_activations else tf.nn.relu)
batch_norm_scale=True)
features, end_points = get_network(
model_variant, preprocess_images, arg_scope)(
inputs=images,
......@@ -266,8 +261,7 @@ def extract_features(images,
output_stride=output_stride,
multi_grid=multi_grid,
reuse=reuse,
scope=name_scope[model_variant],
use_bounded_activations=use_bounded_activations)
scope=name_scope[model_variant])
elif 'xception' in model_variant:
arg_scope = arg_scopes_map[model_variant](
weight_decay=weight_decay,
......
......@@ -44,8 +44,7 @@ def bottleneck(inputs,
unit_rate=1,
rate=1,
outputs_collections=None,
scope=None,
use_bounded_activations=True):
scope=None):
"""Bottleneck residual unit variant with BN after convolutions.
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
......@@ -65,8 +64,6 @@ def bottleneck(inputs,
rate: An integer, rate for atrous convolution.
outputs_collections: Collection to add the ResNet unit output.
scope: Optional variable_scope.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
Returns:
The ResNet unit's output.
......@@ -81,7 +78,7 @@ def bottleneck(inputs,
depth,
[1, 1],
stride=stride,
activation_fn=tf.nn.relu6 if use_bounded_activations else None,
activation_fn=None,
scope='shortcut')
residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
......@@ -90,13 +87,7 @@ def bottleneck(inputs,
rate=rate*unit_rate, scope='conv2')
residual = slim.conv2d(residual, depth, [1, 1], stride=1,
activation_fn=None, scope='conv3')
if use_bounded_activations:
# Use clip_by_value to simulate bandpass activation.
residual = tf.clip_by_value(residual, -6.0, 6.0)
output = tf.nn.relu6(shortcut + residual)
else:
output = tf.nn.relu(shortcut + residual)
output = tf.nn.relu(shortcut + residual)
return slim.utils.collect_named_outputs(outputs_collections,
sc.name,
......@@ -129,8 +120,6 @@ def resnet_v1_beta(inputs,
global_pool=True,
output_stride=None,
root_block_fn=None,
store_non_strided_activations=False,
use_bounded_activations=False,
reuse=None,
scope=None):
"""Generator for v1 ResNet models (beta variant).
......@@ -159,14 +148,6 @@ def resnet_v1_beta(inputs,
root_block_fn: The function consisting of convolution operations applied to
the root input. If root_block_fn is None, use the original setting of
ResNet-v1, which is simply one convolution with 7x7 kernel and stride=2.
store_non_strided_activations: If True, we compute non-strided (undecimated)
activations at the last unit of each block and store them in the
`outputs_collections` before subsampling them. This gives us access to
higher resolution intermediate activations which are useful in some
dense prediction problems but increases 4x the computation and memory cost
at the last unit of each block.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
......@@ -196,35 +177,32 @@ def resnet_v1_beta(inputs,
with slim.arg_scope([slim.conv2d, bottleneck,
resnet_utils.stack_blocks_dense],
outputs_collections=end_points_collection):
with slim.arg_scope(
[bottleneck], use_bounded_activations=use_bounded_activations):
if is_training is not None:
arg_scope = slim.arg_scope([slim.batch_norm], is_training=is_training)
else:
arg_scope = slim.arg_scope([])
with arg_scope:
net = inputs
if output_stride is not None:
if output_stride % 4 != 0:
raise ValueError('The output_stride needs to be a multiple of 4.')
output_stride /= 4
net = root_block_fn(net)
net = slim.max_pool2d(net, 3, stride=2, padding='SAME', scope='pool1')
net = resnet_utils.stack_blocks_dense(net, blocks, output_stride,
store_non_strided_activations)
if global_pool:
# Global average pooling.
net = tf.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
if num_classes is not None:
net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, scope='logits')
# Convert end_points_collection into a dictionary of end_points.
end_points = slim.utils.convert_collection_to_dict(
end_points_collection)
if num_classes is not None:
end_points['predictions'] = slim.softmax(net, scope='predictions')
return net, end_points
if is_training is not None:
arg_scope = slim.arg_scope([slim.batch_norm], is_training=is_training)
else:
arg_scope = slim.arg_scope([])
with arg_scope:
net = inputs
if output_stride is not None:
if output_stride % 4 != 0:
raise ValueError('The output_stride needs to be a multiple of 4.')
output_stride /= 4
net = root_block_fn(net)
net = slim.max_pool2d(net, 3, stride=2, padding='SAME', scope='pool1')
net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
if global_pool:
# Global average pooling.
net = tf.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
if num_classes is not None:
net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
normalizer_fn=None, scope='logits')
# Convert end_points_collection into a dictionary of end_points.
end_points = slim.utils.convert_collection_to_dict(
end_points_collection)
if num_classes is not None:
end_points['predictions'] = slim.softmax(net, scope='predictions')
return net, end_points
def resnet_v1_beta_block(scope, base_depth, num_units, stride):
......@@ -258,9 +236,7 @@ def resnet_v1_50(inputs,
is_training=None,
global_pool=False,
output_stride=None,
store_non_strided_activations=False,
multi_grid=None,
use_bounded_activations=False,
reuse=None,
scope='resnet_v1_50'):
"""Resnet v1 50.
......@@ -275,15 +251,7 @@ def resnet_v1_50(inputs,
output_stride: If None, then the output will be computed at the nominal
network stride. If output_stride is not None, it specifies the requested
ratio of input to output spatial resolution.
store_non_strided_activations: If True, we compute non-strided (undecimated)
activations at the last unit of each block and store them in the
`outputs_collections` before subsampling them. This gives us access to
higher resolution intermediate activations which are useful in some
dense prediction problems but increases 4x the computation and memory cost
at the last unit of each block.
multi_grid: Employ a hierarchy of different atrous rates within network.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
......@@ -328,10 +296,8 @@ def resnet_v1_50(inputs,
is_training=is_training,
global_pool=global_pool,
output_stride=output_stride,
store_non_strided_activations=store_non_strided_activations,
reuse=reuse,
scope=scope,
use_bounded_activations=use_bounded_activations)
scope=scope)
def resnet_v1_50_beta(inputs,
......@@ -339,9 +305,7 @@ def resnet_v1_50_beta(inputs,
is_training=None,
global_pool=False,
output_stride=None,
store_non_strided_activations=False,
multi_grid=None,
use_bounded_activations=False,
reuse=None,
scope='resnet_v1_50'):
"""Resnet v1 50 beta variant.
......@@ -360,15 +324,7 @@ def resnet_v1_50_beta(inputs,
output_stride: If None, then the output will be computed at the nominal
network stride. If output_stride is not None, it specifies the requested
ratio of input to output spatial resolution.
store_non_strided_activations: If True, we compute non-strided (undecimated)
activations at the last unit of each block and store them in the
`outputs_collections` before subsampling them. This gives us access to
higher resolution intermediate activations which are useful in some
dense prediction problems but increases 4x the computation and memory cost
at the last unit of each block.
multi_grid: Employ a hierarchy of different atrous rates within network.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
......@@ -414,10 +370,8 @@ def resnet_v1_50_beta(inputs,
global_pool=global_pool,
output_stride=output_stride,
root_block_fn=functools.partial(root_block_fn_for_beta_variant),
store_non_strided_activations=store_non_strided_activations,
reuse=reuse,
scope=scope,
use_bounded_activations=use_bounded_activations)
scope=scope)
def resnet_v1_101(inputs,
......@@ -425,9 +379,7 @@ def resnet_v1_101(inputs,
is_training=None,
global_pool=False,
output_stride=None,
store_non_strided_activations=False,
multi_grid=None,
use_bounded_activations=False,
reuse=None,
scope='resnet_v1_101'):
"""Resnet v1 101.
......@@ -442,15 +394,7 @@ def resnet_v1_101(inputs,
output_stride: If None, then the output will be computed at the nominal
network stride. If output_stride is not None, it specifies the requested
ratio of input to output spatial resolution.
store_non_strided_activations: If True, we compute non-strided (undecimated)
activations at the last unit of each block and store them in the
`outputs_collections` before subsampling them. This gives us access to
higher resolution intermediate activations which are useful in some
dense prediction problems but increases 4x the computation and memory cost
at the last unit of each block.
multi_grid: Employ a hierarchy of different atrous rates within network.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
......@@ -495,10 +439,8 @@ def resnet_v1_101(inputs,
is_training=is_training,
global_pool=global_pool,
output_stride=output_stride,
store_non_strided_activations=store_non_strided_activations,
reuse=reuse,
scope=scope,
use_bounded_activations=use_bounded_activations)
scope=scope)
def resnet_v1_101_beta(inputs,
......@@ -506,9 +448,7 @@ def resnet_v1_101_beta(inputs,
is_training=None,
global_pool=False,
output_stride=None,
store_non_strided_activations=False,
multi_grid=None,
use_bounded_activations=False,
reuse=None,
scope='resnet_v1_101'):
"""Resnet v1 101 beta variant.
......@@ -527,15 +467,7 @@ def resnet_v1_101_beta(inputs,
output_stride: If None, then the output will be computed at the nominal
network stride. If output_stride is not None, it specifies the requested
ratio of input to output spatial resolution.
store_non_strided_activations: If True, we compute non-strided (undecimated)
activations at the last unit of each block and store them in the
`outputs_collections` before subsampling them. This gives us access to
higher resolution intermediate activations which are useful in some
dense prediction problems but increases 4x the computation and memory cost
at the last unit of each block.
multi_grid: Employ a hierarchy of different atrous rates within network.
use_bounded_activations: Whether or not to use bounded activations. Bounded
activations better lend themselves to quantized inference.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
......@@ -581,7 +513,5 @@ def resnet_v1_101_beta(inputs,
global_pool=global_pool,
output_stride=output_stride,
root_block_fn=functools.partial(root_block_fn_for_beta_variant),
store_non_strided_activations=store_non_strided_activations,
use_bounded_activations=use_bounded_activations,
reuse=reuse,
scope=scope)
......@@ -53,7 +53,6 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
is_training=True,
global_pool=True,
output_stride=None,
store_non_strided_activations=False,
multi_grid=None,
reuse=None,
scope='resnet_v1_small'):
......@@ -84,7 +83,6 @@ class ResnetCompleteNetworkTest(tf.test.TestCase):
output_stride=output_stride,
root_block_fn=functools.partial(
resnet_v1_beta.root_block_fn_for_beta_variant),
store_non_strided_activations=store_non_strided_activations,
reuse=reuse,
scope=scope)
......
......@@ -89,6 +89,7 @@ _CITYSCAPES_INFORMATION = DatasetDescriptor(
_PASCAL_VOC_SEG_INFORMATION = DatasetDescriptor(
splits_to_sizes={
'train': 1464,
'train_aug': 10582,
'trainval': 2913,
'val': 1449,
},
......
......@@ -49,7 +49,7 @@ A local training job using `xception_65` can be run with the following command:
# From tensorflow/models/research/
python deeplab/train.py \
--logtostderr \
--training_number_of_steps=90000 \
--training_number_of_steps=150000 \
--train_split="train" \
--model_variant="xception_65" \
--atrous_rates=6 \
......
......@@ -84,7 +84,7 @@ xception_ade20k_train | Xception_65 | ImageNet <br> ADE20K
Checkpoint name | Eval OS | Eval scales | Left-right Flip | mIOU | Pixel-wise Accuracy | File Size
------------------------------------- | :-------: | :-------------------------: | :-------------: | :-------------------: | :-------------------: | :-------:
[xception_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_14.tar.gz) | 16 | [0.5:0.25:1.75] | Yes | 43.54% (val) | 81.74% (val) | 439MB
[xception_ade20k_train](http://download.tensorflow.org/models/deeplabv3_xception_ade20k_train_2018_05_29.tar.gz) | 8 | [0.5:0.25:1.75] | Yes | 45.65% (val) | 82.52% (val) | 439MB
## Checkpoints pretrained on ImageNet
......
......@@ -56,16 +56,12 @@ from deeplab.core import feature_extractor
slim = tf.contrib.slim
_LOGITS_SCOPE_NAME = 'logits'
_MERGED_LOGITS_SCOPE = 'merged_logits'
_IMAGE_POOLING_SCOPE = 'image_pooling'
_ASPP_SCOPE = 'aspp'
_CONCAT_PROJECTION_SCOPE = 'concat_projection'
_DECODER_SCOPE = 'decoder'
def get_merged_logits_scope():
return _MERGED_LOGITS_SCOPE
LOGITS_SCOPE_NAME = 'logits'
MERGED_LOGITS_SCOPE = 'merged_logits'
IMAGE_POOLING_SCOPE = 'image_pooling'
ASPP_SCOPE = 'aspp'
CONCAT_PROJECTION_SCOPE = 'concat_projection'
DECODER_SCOPE = 'decoder'
def get_extra_layer_scopes(last_layers_contain_logits_only=False):
......@@ -79,14 +75,14 @@ def get_extra_layer_scopes(last_layers_contain_logits_only=False):
A list of scopes for extra layers.
"""
if last_layers_contain_logits_only:
return [_LOGITS_SCOPE_NAME]
return [LOGITS_SCOPE_NAME]
else:
return [
_LOGITS_SCOPE_NAME,
_IMAGE_POOLING_SCOPE,
_ASPP_SCOPE,
_CONCAT_PROJECTION_SCOPE,
_DECODER_SCOPE,
LOGITS_SCOPE_NAME,
IMAGE_POOLING_SCOPE,
ASPP_SCOPE,
CONCAT_PROJECTION_SCOPE,
DECODER_SCOPE,
]
......@@ -133,7 +129,7 @@ def predict_labels_multi_scale(images,
for output in sorted(outputs_to_scales_to_logits):
scales_to_logits = outputs_to_scales_to_logits[output]
logits = tf.image.resize_bilinear(
scales_to_logits[_MERGED_LOGITS_SCOPE],
scales_to_logits[MERGED_LOGITS_SCOPE],
tf.shape(images)[1:3],
align_corners=True)
outputs_to_predictions[output].append(
......@@ -143,7 +139,7 @@ def predict_labels_multi_scale(images,
scales_to_logits_reversed = (
outputs_to_scales_to_logits_reversed[output])
logits_reversed = tf.image.resize_bilinear(
tf.reverse_v2(scales_to_logits_reversed[_MERGED_LOGITS_SCOPE], [2]),
tf.reverse_v2(scales_to_logits_reversed[MERGED_LOGITS_SCOPE], [2]),
tf.shape(images)[1:3],
align_corners=True)
outputs_to_predictions[output].append(
......@@ -182,7 +178,7 @@ def predict_labels(images, model_options, image_pyramid=None):
for output in sorted(outputs_to_scales_to_logits):
scales_to_logits = outputs_to_scales_to_logits[output]
logits = tf.image.resize_bilinear(
scales_to_logits[_MERGED_LOGITS_SCOPE],
scales_to_logits[MERGED_LOGITS_SCOPE],
tf.shape(images)[1:3],
align_corners=True)
predictions[output] = tf.argmax(logits, 3)
......@@ -221,7 +217,6 @@ def multi_scale_logits(images,
images: A tensor of size [batch, height, width, channels].
model_options: A ModelOptions instance to configure models.
image_pyramid: Input image scales for multi-scale feature extraction.
weight_decay: The weight decay for model variables.
is_training: Is training or not.
fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
......@@ -242,17 +237,9 @@ def multi_scale_logits(images,
# Setup default values.
if not image_pyramid:
image_pyramid = [1.0]
if model_options.crop_size is None and model_options.add_image_level_feature:
raise ValueError(
'Crop size must be specified for using image-level feature.')
if model_options.model_variant == 'mobilenet_v2':
if (model_options.atrous_rates is not None or
model_options.decoder_output_stride is not None):
# Output a warning and users should make sure if the setting is desired.
tf.logging.warning('Our provided mobilenet_v2 checkpoint does not '
'include ASPP and decoder modules.')
crop_height = (
model_options.crop_size[0]
if model_options.crop_size else tf.shape(images)[1])
......@@ -277,7 +264,7 @@ def multi_scale_logits(images,
for k in model_options.outputs_to_num_classes
}
for count, image_scale in enumerate(image_pyramid):
for image_scale in image_pyramid:
if image_scale != 1.0:
scaled_height = scale_dimension(crop_height, image_scale)
scaled_width = scale_dimension(crop_width, image_scale)
......@@ -295,7 +282,7 @@ def multi_scale_logits(images,
scaled_images,
updated_options,
weight_decay=weight_decay,
reuse=True if count else None,
reuse=tf.AUTO_REUSE,
is_training=is_training,
fine_tune_batch_norm=fine_tune_batch_norm)
......@@ -309,7 +296,7 @@ def multi_scale_logits(images,
if len(image_pyramid) == 1:
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_scales_to_logits[output][
_MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
return outputs_to_scales_to_logits
# Save logits to the output map.
......@@ -328,18 +315,18 @@ def multi_scale_logits(images,
merge_fn = (
tf.reduce_max
if model_options.merge_method == 'max' else tf.reduce_mean)
outputs_to_scales_to_logits[output][_MERGED_LOGITS_SCOPE] = merge_fn(
outputs_to_scales_to_logits[output][MERGED_LOGITS_SCOPE] = merge_fn(
all_logits, axis=4)
return outputs_to_scales_to_logits
def _extract_features(images,
model_options,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
def extract_features(images,
model_options,
weight_decay=0.0001,
reuse=None,
is_training=False,
fine_tune_batch_norm=False):
"""Extracts features by the particular model_variant.
Args:
......@@ -399,7 +386,7 @@ def _extract_features(images,
features, [pool_height, pool_width], [pool_height, pool_width],
padding='VALID')
image_feature = slim.conv2d(
image_feature, depth, 1, scope=_IMAGE_POOLING_SCOPE)
image_feature, depth, 1, scope=IMAGE_POOLING_SCOPE)
image_feature = tf.image.resize_bilinear(
image_feature, [pool_height, pool_width], align_corners=True)
image_feature.set_shape([None, pool_height, pool_width, depth])
......@@ -407,14 +394,14 @@ def _extract_features(images,
# Employ a 1x1 convolution.
branch_logits.append(slim.conv2d(features, depth, 1,
scope=_ASPP_SCOPE + str(0)))
scope=ASPP_SCOPE + str(0)))
if model_options.atrous_rates:
# Employ 3x3 convolutions with different atrous rates.
for i, rate in enumerate(model_options.atrous_rates, 1):
scope = _ASPP_SCOPE + str(i)
scope = ASPP_SCOPE + str(i)
if model_options.aspp_with_separable_conv:
aspp_features = _split_separable_conv2d(
aspp_features = split_separable_conv2d(
features,
filters=depth,
rate=rate,
......@@ -428,12 +415,12 @@ def _extract_features(images,
# Merge branch logits.
concat_logits = tf.concat(branch_logits, 3)
concat_logits = slim.conv2d(
concat_logits, depth, 1, scope=_CONCAT_PROJECTION_SCOPE)
concat_logits, depth, 1, scope=CONCAT_PROJECTION_SCOPE)
concat_logits = slim.dropout(
concat_logits,
keep_prob=0.9,
is_training=is_training,
scope=_CONCAT_PROJECTION_SCOPE + '_dropout')
scope=CONCAT_PROJECTION_SCOPE + '_dropout')
return concat_logits, end_points
......@@ -457,7 +444,7 @@ def _get_logits(images,
Returns:
outputs_to_logits: A map from output_type to logits.
"""
features, end_points = _extract_features(
features, end_points = extract_features(
images,
model_options,
weight_decay=weight_decay,
......@@ -484,7 +471,7 @@ def _get_logits(images,
outputs_to_logits = {}
for output in sorted(model_options.outputs_to_num_classes):
outputs_to_logits[output] = _get_branch_logits(
outputs_to_logits[output] = get_branch_logits(
features,
model_options.outputs_to_num_classes[output],
model_options.atrous_rates,
......@@ -543,7 +530,7 @@ def refine_by_decoder(features,
stride=1,
reuse=reuse):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
with tf.variable_scope(_DECODER_SCOPE, _DECODER_SCOPE, [features]):
with tf.variable_scope(DECODER_SCOPE, DECODER_SCOPE, [features]):
feature_list = feature_extractor.networks_to_feature_maps[
model_variant][feature_extractor.DECODER_END_POINTS]
if feature_list is None:
......@@ -553,8 +540,13 @@ def refine_by_decoder(features,
decoder_features = features
for i, name in enumerate(feature_list):
decoder_features_list = [decoder_features]
feature_name = '{}/{}'.format(
feature_extractor.name_scope[model_variant], name)
# MobileNet variants use different naming convention.
if 'mobilenet' in model_variant:
feature_name = name
else:
feature_name = '{}/{}'.format(
feature_extractor.name_scope[model_variant], name)
decoder_features_list.append(
slim.conv2d(
end_points[feature_name],
......@@ -569,13 +561,13 @@ def refine_by_decoder(features,
[None, decoder_height, decoder_width, None])
decoder_depth = 256
if decoder_use_separable_conv:
decoder_features = _split_separable_conv2d(
decoder_features = split_separable_conv2d(
tf.concat(decoder_features_list, 3),
filters=decoder_depth,
rate=1,
weight_decay=weight_decay,
scope='decoder_conv0')
decoder_features = _split_separable_conv2d(
decoder_features = split_separable_conv2d(
decoder_features,
filters=decoder_depth,
rate=1,
......@@ -593,14 +585,14 @@ def refine_by_decoder(features,
return decoder_features
def _get_branch_logits(features,
num_classes,
atrous_rates=None,
aspp_with_batch_norm=False,
kernel_size=1,
weight_decay=0.0001,
reuse=None,
scope_suffix=''):
def get_branch_logits(features,
num_classes,
atrous_rates=None,
aspp_with_batch_norm=False,
kernel_size=1,
weight_decay=0.0001,
reuse=None,
scope_suffix=''):
"""Gets the logits from each model's branch.
The underlying model is branched out in the last layer when atrous
......@@ -624,7 +616,7 @@ def _get_branch_logits(features,
ValueError: Upon invalid input kernel_size value.
"""
# When using batch normalization with ASPP, ASPP has been applied before
# in _extract_features, and thus we simply apply 1x1 convolution here.
# in extract_features, and thus we simply apply 1x1 convolution here.
if aspp_with_batch_norm or atrous_rates is None:
if kernel_size != 1:
raise ValueError('Kernel size must be 1 when atrous_rates is None or '
......@@ -636,7 +628,7 @@ def _get_branch_logits(features,
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
reuse=reuse):
with tf.variable_scope(_LOGITS_SCOPE_NAME, _LOGITS_SCOPE_NAME, [features]):
with tf.variable_scope(LOGITS_SCOPE_NAME, LOGITS_SCOPE_NAME, [features]):
branch_logits = []
for i, rate in enumerate(atrous_rates):
scope = scope_suffix
......@@ -656,13 +648,14 @@ def _get_branch_logits(features,
return tf.add_n(branch_logits)
def _split_separable_conv2d(inputs,
filters,
rate=1,
weight_decay=0.00004,
depthwise_weights_initializer_stddev=0.33,
pointwise_weights_initializer_stddev=0.06,
scope=None):
def split_separable_conv2d(inputs,
filters,
kernel_size=3,
rate=1,
weight_decay=0.00004,
depthwise_weights_initializer_stddev=0.33,
pointwise_weights_initializer_stddev=0.06,
scope=None):
"""Splits a separable conv2d into depthwise and pointwise conv2d.
This operation differs from `tf.layers.separable_conv2d` as this operation
......@@ -671,6 +664,8 @@ def _split_separable_conv2d(inputs,
Args:
inputs: Input tensor with shape [batch, height, width, channels].
filters: Number of filters in the 1x1 pointwise convolution.
kernel_size: A list of length 2: [kernel_height, kernel_width] of
the filters. Can be an int if both values are the same.
rate: Atrous convolution rate for the depthwise convolution.
weight_decay: The weight decay to use for regularizing the model.
depthwise_weights_initializer_stddev: The standard deviation of the
......@@ -685,7 +680,7 @@ def _split_separable_conv2d(inputs,
outputs = slim.separable_conv2d(
inputs,
None,
3,
kernel_size=kernel_size,
depth_multiplier=1,
rate=rate,
weights_initializer=tf.truncated_normal_initializer(
......
......@@ -101,6 +101,8 @@ flags.DEFINE_float('momentum', 0.9, 'The momentum value to use')
flags.DEFINE_integer('train_batch_size', 8,
'The number of images in each batch during training.')
# For weight_decay, use 0.00004 for MobileNet-V2 or Xception model variants.
# Use 0.0001 for ResNet model variants.
flags.DEFINE_float('weight_decay', 0.00004,
'The value of the weight decay for training.')
......@@ -206,8 +208,8 @@ def _build_deeplab(inputs_queue, outputs_to_num_classes, ignore_label):
# Add name to graph node so we can add to summary.
output_type_dict = outputs_to_scales_to_logits[common.OUTPUT_TYPE]
output_type_dict[model.get_merged_logits_scope()] = tf.identity(
output_type_dict[model.get_merged_logits_scope()],
output_type_dict[model.MERGED_LOGITS_SCOPE] = tf.identity(
output_type_dict[model.MERGED_LOGITS_SCOPE],
name=common.OUTPUT_TYPE)
for output, num_classes in six.iteritems(outputs_to_num_classes):
......
......@@ -29,12 +29,14 @@ import numpy as np
# Dataset names.
_ADE20K = 'ade20k'
_CITYSCAPES = 'cityscapes'
_MAPILLARY_VISTAS = 'mapillary_vistas'
_PASCAL = 'pascal'
# Max number of entries in the colormap for each dataset.
_DATASET_MAX_ENTRIES = {
_ADE20K: 151,
_CITYSCAPES: 19,
_MAPILLARY_VISTAS: 66,
_PASCAL: 256,
}
......@@ -229,6 +231,82 @@ def create_cityscapes_label_colormap():
])
def create_mapillary_vistas_label_colormap():
  """Builds the label colormap for the Mapillary Vistas segmentation benchmark.

  Returns:
    A colormap for visualizing segmentation results: a numpy array with 66
    rows of three color components each, where the row index is the label.
  """
  # One row per label index, in label order; the order must not change.
  palette = (
      (165, 42, 42),
      (0, 192, 0),
      (196, 196, 196),
      (190, 153, 153),
      (180, 165, 180),
      (102, 102, 156),
      (102, 102, 156),
      (128, 64, 255),
      (140, 140, 200),
      (170, 170, 170),
      (250, 170, 160),
      (96, 96, 96),
      (230, 150, 140),
      (128, 64, 128),
      (110, 110, 110),
      (244, 35, 232),
      (150, 100, 100),
      (70, 70, 70),
      (150, 120, 90),
      (220, 20, 60),
      (255, 0, 0),
      (255, 0, 0),
      (255, 0, 0),
      (200, 128, 128),
      (255, 255, 255),
      (64, 170, 64),
      (128, 64, 64),
      (70, 130, 180),
      (255, 255, 255),
      (152, 251, 152),
      (107, 142, 35),
      (0, 170, 30),
      (255, 255, 128),
      (250, 0, 30),
      (0, 0, 0),
      (220, 220, 220),
      (170, 170, 170),
      (222, 40, 40),
      (100, 170, 30),
      (40, 40, 40),
      (33, 33, 33),
      (170, 170, 170),
      (0, 0, 142),
      (170, 170, 170),
      (210, 170, 100),
      (153, 153, 153),
      (128, 128, 128),
      (0, 0, 142),
      (250, 170, 30),
      (192, 192, 192),
      (220, 220, 0),
      (180, 165, 180),
      (119, 11, 32),
      (0, 0, 142),
      (0, 60, 100),
      (0, 0, 142),
      (0, 0, 90),
      (0, 0, 230),
      (0, 80, 100),
      (128, 64, 64),
      (0, 0, 110),
      (0, 0, 70),
      (0, 0, 192),
      (32, 32, 32),
      (0, 0, 0),
      (0, 0, 0),
  )
  return np.asarray(palette)
def create_pascal_label_colormap():
"""Creates a label colormap used in PASCAL VOC segmentation benchmark.
......@@ -254,6 +332,10 @@ def get_cityscapes_name():
return _CITYSCAPES
def get_mapillary_vistas_name():
  """Returns the module-level name constant for Mapillary Vistas."""
  return _MAPILLARY_VISTAS
def get_pascal_name():
return _PASCAL
......@@ -287,6 +369,8 @@ def create_label_colormap(dataset=_PASCAL):
return create_ade20k_label_colormap()
elif dataset == _CITYSCAPES:
return create_cityscapes_label_colormap()
elif dataset == _MAPILLARY_VISTAS:
return create_mapillary_vistas_label_colormap()
elif dataset == _PASCAL:
return create_pascal_label_colormap()
else:
......
......@@ -86,6 +86,11 @@ class VisualizationUtilTest(tf.test.TestCase):
label, get_dataset_colormap.get_ade20k_name())
self.assertTrue(np.array_equal(colored_label, expected_result))
def testMapillaryVistasColorMapValue(self):
  """Spot-checks two rows of the Mapillary Vistas colormap."""
  mapillary_colormap = (
      get_dataset_colormap.create_mapillary_vistas_label_colormap())
  # (label index, expected color row) pairs, checked in this order.
  expected_color_by_label = (
      (3, [190, 153, 153]),
      (6, [102, 102, 156]),
  )
  for label, expected_color in expected_color_by_label:
    self.assertTrue(
        np.array_equal(expected_color, mapillary_colormap[label, :]))
if __name__ == '__main__':
tf.test.main()
......@@ -17,6 +17,7 @@
import six
import tensorflow as tf
from deeplab.core import preprocess_utils
slim = tf.contrib.slim
......@@ -54,12 +55,16 @@ def add_softmax_cross_entropy_loss_for_each_scale(scales_to_logits,
if upsample_logits:
# Label is not downsampled, and instead we upsample logits.
logits = tf.image.resize_bilinear(
logits, tf.shape(labels)[1:3], align_corners=True)
logits,
preprocess_utils.resolve_shape(labels, 4)[1:3],
align_corners=True)
scaled_labels = labels
else:
# Label is downsampled to the same size as logits.
scaled_labels = tf.image.resize_nearest_neighbor(
labels, tf.shape(logits)[1:3], align_corners=True)
labels,
preprocess_utils.resolve_shape(logits, 4)[1:3],
align_corners=True)
scaled_labels = tf.reshape(scaled_labels, shape=[-1])
not_ignore_mask = tf.to_float(tf.not_equal(scaled_labels,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment