# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.utils import utils

__all__ = ['PanopticDeepLab']

# Keyword arguments that PanopticDeepLabHead reads to build its instance
# branch. They are validated together so that a missing entry is reported
# before any layer is constructed.
_INSTANCE_BRANCH_KEYS = (
    'instance_aspp_out_channels',
    'instance_decoder_channels',
    'instance_low_level_channels_projects',
    'instance_num_classes',
    'instance_head_channels',
    'instance_class_key',
)


@manager.MODELS.add_component
class PanopticDeepLab(nn.Layer):
    """
    The PanopticDeeplab implementation based on PaddlePaddle.

    The original article refers to
    Bowen Cheng, et, al. "Panoptic-DeepLab: A Simple, Strong, and Fast
    Baseline for Bottom-Up Panoptic Segmentation"
    (https://arxiv.org/abs/1911.10194)

    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently support
            Resnet50_vd/Resnet101_vd/Xception65. It must expose
            `feat_channels` and return a list of feature maps.
        backbone_indices (tuple, optional): Indices of the backbone outputs
            that are used. The last index selects the input of the ASPP
            module; the preceding indices select the low-level features that
            the decoder fuses, in fusion order. Default: (2, 1, 0, 3).
        aspp_ratios (tuple, optional): The dilation rates used in the ASPP
            module. If output_stride=16, aspp_ratios should be set as
            (1, 6, 12, 18). If output_stride=8, aspp_ratios is
            (1, 12, 24, 36). Default: (1, 6, 12, 18).
        aspp_out_channels (int, optional): The output channels of the ASPP
            module. Default: 256.
        decoder_channels (int, optional): The channels of the decoder.
            Default: 256.
        low_level_channels_projects (list, optional): The channels that each
            low-level feature is projected to. It needs
            `len(backbone_indices) - 1` entries. Default: None, which is
            treated as an empty list.
        align_corners (bool, optional): An argument of F.interpolate. It
            should be set to False when the feature size is even,
            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
        pretrained (str, optional): The path or url of pretrained model.
            Default: None.
        **kwargs: Forwarded to `PanopticDeepLabHead`, which requires the keys
            `instance_aspp_out_channels`, `instance_decoder_channels`,
            `instance_low_level_channels_projects`, `instance_num_classes`,
            `instance_head_channels` and `instance_class_key`.
    """

    def __init__(self,
                 num_classes,
                 backbone,
                 backbone_indices=(2, 1, 0, 3),
                 aspp_ratios=(1, 6, 12, 18),
                 aspp_out_channels=256,
                 decoder_channels=256,
                 low_level_channels_projects=None,
                 align_corners=False,
                 pretrained=None,
                 **kwargs):
        super().__init__()

        self.backbone = backbone
        backbone_channels = [
            backbone.feat_channels[i] for i in backbone_indices
        ]

        self.head = PanopticDeepLabHead(
            num_classes=num_classes,
            backbone_indices=backbone_indices,
            backbone_channels=backbone_channels,
            aspp_ratios=aspp_ratios,
            aspp_out_channels=aspp_out_channels,
            decoder_channels=decoder_channels,
            align_corners=align_corners,
            low_level_channels_projects=low_level_channels_projects,
            **kwargs)

        self.align_corners = align_corners
        self.pretrained = pretrained
        self.init_weight()

    def _upsample_predictions(self, pred, input_shape):
        """Upsamples final prediction, with special handling to offset.

        Args:
            pred (dict): stores all output of the segmentation model.
            input_shape (tuple): spatial resolution of the desired shape.

        Returns:
            result (OrderedDict): upsampled dictionary.
        """
        result = OrderedDict()
        for key, value in pred.items():
            out = F.interpolate(
                value,
                size=input_shape,
                mode='bilinear',
                align_corners=self.align_corners)
            if 'offset' in key:
                # Offset maps are multiplied by the upsampling factor in
                # addition to being resized (presumably because offsets are
                # expressed in pixels of the prediction grid -- confirm).
                # The factor is derived from the height only.
                if input_shape[0] % 2 == 0:
                    scale = input_shape[0] // value.shape[2]
                else:
                    scale = (input_shape[0] - 1) // (value.shape[2] - 1)
                out *= scale
            result[key] = out
        return result

    def forward(self, x):
        """Run the network and return `[semantic, center, offset]` maps.

        The head has to produce predictions under the keys 'semantic',
        'center' and 'offset'; each one is resized to the spatial size of `x`.
        """
        feat_list = self.backbone(x)
        logit_dict = self.head(feat_list)
        results = self._upsample_predictions(logit_dict, x.shape[-2:])

        return [results['semantic'], results['center'], results['offset']]

    def init_weight(self):
        """Load the whole model from `self.pretrained` when it is set."""
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)


class PanopticDeepLabHead(nn.Layer):
    """
    The head of PanopticDeepLab: a semantic branch and an instance branch,
    each made of a decoder followed by a prediction head.

    Args:
        num_classes (int): The unique number of target classes.
        backbone_indices (tuple): Indices of the backbone outputs that are
            used. The last index selects the input of the ASPP module; the
            preceding indices select the low-level features that each decoder
            fuses, in fusion order.
        backbone_channels (tuple): The same length with "backbone_indices".
            It indicates the channels of corresponding index.
        aspp_ratios (tuple): The dilation rates used in the ASPP module.
        aspp_out_channels (int): The output channels of the ASPP module of the
            semantic branch.
        decoder_channels (int): The channels of the semantic decoder.
        align_corners (bool): An argument of F.interpolate. It should be set
            to False when the feature size is even, e.g. 1024x512, otherwise
            it is True, e.g. 769x769.
        low_level_channels_projects (list): The channels that each low-level
            feature is projected to in the semantic branch.
        **kwargs: Configuration of the instance branch. Required keys:
            instance_aspp_out_channels (int), instance_decoder_channels (int),
            instance_low_level_channels_projects (list),
            instance_num_classes (list), instance_head_channels (int) and
            instance_class_key (list).

    Raises:
        KeyError: If any of the required instance keys is missing.
    """

    def __init__(self, num_classes, backbone_indices, backbone_channels,
                 aspp_ratios, aspp_out_channels, decoder_channels,
                 align_corners, low_level_channels_projects, **kwargs):
        super().__init__()

        # Report every missing instance option at once instead of failing on
        # the first lookup after part of the network has been built.
        missing = [key for key in _INSTANCE_BRANCH_KEYS if key not in kwargs]
        if missing:
            raise KeyError(
                "PanopticDeepLabHead is missing required keyword arguments: "
                "{}".format(missing))

        self.semantic_decoder = SinglePanopticDeepLabDecoder(
            backbone_indices=backbone_indices,
            backbone_channels=backbone_channels,
            aspp_ratios=aspp_ratios,
            aspp_out_channels=aspp_out_channels,
            decoder_channels=decoder_channels,
            align_corners=align_corners,
            low_level_channels_projects=low_level_channels_projects)
        self.semantic_head = SinglePanopticDeepLabHead(
            num_classes=[num_classes],
            decoder_channels=decoder_channels,
            head_channels=decoder_channels,
            class_key=['semantic'])
        self.instance_decoder = SinglePanopticDeepLabDecoder(
            backbone_indices=backbone_indices,
            backbone_channels=backbone_channels,
            aspp_ratios=aspp_ratios,
            aspp_out_channels=kwargs['instance_aspp_out_channels'],
            decoder_channels=kwargs['instance_decoder_channels'],
            align_corners=align_corners,
            low_level_channels_projects=kwargs[
                'instance_low_level_channels_projects'])
        self.instance_head = SinglePanopticDeepLabHead(
            num_classes=kwargs['instance_num_classes'],
            decoder_channels=kwargs['instance_decoder_channels'],
            head_channels=kwargs['instance_head_channels'],
            class_key=kwargs['instance_class_key'])

    def forward(self, features):
        """Return a dict merging the semantic and instance predictions.

        Instance entries are written after the semantic ones, so an instance
        key equal to a semantic key replaces it.
        """
        pred = {}

        # Semantic branch
        semantic = self.semantic_decoder(features)
        pred.update(self.semantic_head(semantic))

        # Instance branch
        instance = self.instance_decoder(features)
        pred.update(self.instance_head(instance))

        return pred


class SeparableConvBNReLU(nn.Layer):
    """
    A depthwise ConvBNReLU followed by a 1x1 pointwise ConvBNReLU.

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int): The kernel size of the depthwise convolution.
        padding (int|str, optional): The padding of the depthwise
            convolution. Default: 'same'.
        **kwargs: Extra arguments passed to the depthwise ConvBNReLU only.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 padding='same',
                 **kwargs):
        super().__init__()
        self.depthwise_conv = layers.ConvBNReLU(
            in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            padding=padding,
            groups=in_channels,
            **kwargs)
        # NOTE: the attribute name is misspelled ("piontwise"). It is kept
        # as is because renaming it would presumably change the parameter
        # names stored in existing checkpoints.
        self.piontwise_conv = layers.ConvBNReLU(
            in_channels, out_channels, kernel_size=1, groups=1, bias_attr=False)

    def forward(self, x):
        x = self.depthwise_conv(x)
        x = self.piontwise_conv(x)
        return x


class ASPPModule(nn.Layer):
    """
    Atrous Spatial Pyramid Pooling.

    Args:
        aspp_ratios (tuple): The dilation rates used in the ASPP module.
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        align_corners (bool): An argument of F.interpolate. It should be set
            to False when the output size of feature is even, e.g. 1024x512,
            otherwise it is True, e.g. 769x769.
        use_sep_conv (bool, optional): If using separable conv in ASPP module.
            Default: False.
        image_pooling (bool, optional): If augmented with image-level
            features. Default: False
        drop_rate (float, optional): The drop rate. Default: 0.1.
    """

    def __init__(self,
                 aspp_ratios,
                 in_channels,
                 out_channels,
                 align_corners,
                 use_sep_conv=False,
                 image_pooling=False,
                 drop_rate=0.1):
        super().__init__()

        self.align_corners = align_corners
        self.aspp_blocks = nn.LayerList()

        for ratio in aspp_ratios:
            # A ratio of 1 is a plain 1x1 convolution, so the separable
            # variant is only considered for the dilated 3x3 branches.
            if use_sep_conv and ratio > 1:
                conv_func = SeparableConvBNReLU
            else:
                conv_func = layers.ConvBNReLU

            block = conv_func(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1 if ratio == 1 else 3,
                dilation=ratio,
                padding=0 if ratio == 1 else ratio,
                bias_attr=False)
            self.aspp_blocks.append(block)

        # Number of branches that get concatenated in forward().
        out_size = len(self.aspp_blocks)

        if image_pooling:
            self.global_avg_pool = nn.Sequential(
                nn.AdaptiveAvgPool2D(output_size=(1, 1)),
                layers.ConvBNReLU(
                    in_channels, out_channels, kernel_size=1, bias_attr=False))
            out_size += 1
        self.image_pooling = image_pooling

        self.conv_bn_relu = layers.ConvBNReLU(
            in_channels=out_channels * out_size,
            out_channels=out_channels,
            kernel_size=1,
            bias_attr=False)

        self.dropout = nn.Dropout(p=drop_rate)  # drop rate

    def forward(self, x):
        # Computed once up front: the image-pooling branch needs it even when
        # there are no atrous blocks to iterate over.
        interpolate_shape = x.shape[2:]

        outputs = []
        for block in self.aspp_blocks:
            y = block(x)
            y = F.interpolate(
                y,
                interpolate_shape,
                mode='bilinear',
                align_corners=self.align_corners)
            outputs.append(y)

        if self.image_pooling:
            img_avg = self.global_avg_pool(x)
            img_avg = F.interpolate(
                img_avg,
                interpolate_shape,
                mode='bilinear',
                align_corners=self.align_corners)
            outputs.append(img_avg)

        x = paddle.concat(outputs, axis=1)
        x = self.conv_bn_relu(x)
        x = self.dropout(x)

        return x


class SinglePanopticDeepLabDecoder(nn.Layer):
    """
    A single decoder branch of PanopticDeepLab: an ASPP module followed by
    successive fusions with projected low-level features.

    Args:
        backbone_indices (tuple): Indices of the backbone outputs that are
            used. The last index selects the input of the ASPP module; the
            preceding indices select the low-level features that are fused,
            in fusion order.
        backbone_channels (tuple): The same length with "backbone_indices".
            It indicates the channels of corresponding index.
        aspp_ratios (tuple): The dilation rates used in the ASPP module.
        aspp_out_channels (int): The output channels of the ASPP module.
        decoder_channels (int): The channels of decoder
        align_corners (bool): An argument of F.interpolate. It should be set
            to False when the output size of feature is even, e.g. 1024x512,
            otherwise it is True, e.g. 769x769.
        low_level_channels_projects (list): The channels that each low-level
            feature is projected to. None is treated as an empty list.

    Raises:
        ValueError: If `low_level_channels_projects` does not have
            `len(backbone_indices) - 1` entries.
    """

    def __init__(self, backbone_indices, backbone_channels, aspp_ratios,
                 aspp_out_channels, decoder_channels, align_corners,
                 low_level_channels_projects):
        super().__init__()

        # None means "no low-level projection"; normalising it here lets the
        # length check below report a mismatch instead of len(None) failing.
        if low_level_channels_projects is None:
            low_level_channels_projects = []

        self.aspp = ASPPModule(
            aspp_ratios,
            backbone_channels[-1],
            aspp_out_channels,
            align_corners,
            use_sep_conv=False,
            image_pooling=True,
            drop_rate=0.5)
        self.backbone_indices = backbone_indices
        self.decoder_stage = len(low_level_channels_projects)
        if self.decoder_stage != len(self.backbone_indices) - 1:
            raise ValueError(
                "len(low_level_channels_projects) != len(backbone_indices) - 1, they are {} and {}"
                .format(low_level_channels_projects, backbone_indices))
        self.align_corners = align_corners

        # Transform low-level feature
        project = []
        # Fuse
        fuse = []
        # Top-down direction, i.e. starting from largest stride
        for i in range(self.decoder_stage):
            project.append(
                layers.ConvBNReLU(
                    backbone_channels[i],
                    low_level_channels_projects[i],
                    1,
                    bias_attr=False))
            # The first fusion consumes the ASPP output; later ones consume
            # the output of the previous fusion.
            upper_channels = aspp_out_channels if i == 0 else decoder_channels
            fuse_in_channels = upper_channels + low_level_channels_projects[i]
            fuse.append(
                SeparableConvBNReLU(
                    fuse_in_channels,
                    decoder_channels,
                    5,
                    padding=2,
                    bias_attr=False))
        self.project = nn.LayerList(project)
        self.fuse = nn.LayerList(fuse)

    def forward(self, feat_list):
        x = feat_list[self.backbone_indices[-1]]
        x = self.aspp(x)

        for i in range(self.decoder_stage):
            low_level_feat = feat_list[self.backbone_indices[i]]
            low_level_feat = self.project[i](low_level_feat)
            # Resize to the low-level feature so both can be concatenated.
            x = F.interpolate(
                x,
                size=low_level_feat.shape[-2:],
                mode='bilinear',
                align_corners=self.align_corners)
            x = paddle.concat([x, low_level_feat], axis=1)
            x = self.fuse[i](x)

        return x


class SinglePanopticDeepLabHead(nn.Layer):
    """
    Prediction head of a PanopticDeepLab branch. One classifier is built per
    entry of `class_key`, and all of them are applied to the same input.

    Args:
        num_classes (list): The number of output channels of each classifier.
        decoder_channels (int): The channels of decoder.
        head_channels (int): The channels of head.
        class_key (list): The key name of output by classifier.

    Raises:
        ValueError: If `num_classes` and `class_key` differ in length.
    """

    def __init__(self, num_classes, decoder_channels, head_channels,
                 class_key):
        super().__init__()
        self.num_head = len(num_classes)
        if self.num_head != len(class_key):
            raise ValueError(
                "len(num_classes) != len(class_key), they are {} and {}".format(
                    num_classes, class_key))

        classifier = [
            nn.Sequential(
                SeparableConvBNReLU(
                    decoder_channels,
                    head_channels,
                    5,
                    padding=2,
                    bias_attr=False),
                nn.Conv2D(head_channels, out_channels, 1))
            for out_channels in num_classes
        ]

        self.classifier = nn.LayerList(classifier)
        self.class_key = class_key

    def forward(self, x):
        """Return an OrderedDict mapping each class key to its prediction."""
        pred = OrderedDict()
        for i, key in enumerate(self.class_key):
            pred[key] = self.classifier[i](x)

        return pred