# deeplabv2.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.models.backbones import resnet_vd
from paddleseg.utils import utils, logger

from .gscnn import GSCNNHead
from .backbones.resnet import ClassifierModule

# Public names exported by `from <this module> import *`: only the model class.
__all__ = ['DeepLabV2', ]


@manager.MODELS.add_component
class DeepLabV2(nn.Layer):
    """
    The DeepLabV2 implementation based on PaddlePaddle.

    The original article refers to:
        Chen, L. C., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A. L. (2017). Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE transactions on pattern analysis and machine intelligence, 40(4), 834-848.

    Args:
        backbone (paddle.nn.Layer): Backbone network, currently support Resnet101. Its forward
            output is indexed as a sequence: the first two items are upsampled and returned as
            logits, and the last item feeds the edge branch. When ``shape_stream`` is True the
            backbone must also expose a ``conv1_logit`` attribute after its forward pass.
        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
        pretrained (str, optional): Path of the pretrained model, read with ``paddle.load``. Default: None.
        shape_stream (bool, optional): Whether to run the edge branch and return its logit as an
            additional (third) output. Default: False.
    """

    def __init__(self,
                 backbone,
                 align_corners=False,
                 pretrained=None,
                 shape_stream=False):
        super().__init__()

        self.backbone = backbone
        self.shape_stream = shape_stream
        # The edge head is always built (even when shape_stream is False) so
        # that the set of parameters of the model does not depend on the flag.
        self.head = edge_branch(
            inplanes=(64, 2048),
            out_channels=1024,
            dilation_series=[6, 12, 18, 24],
            padding_series=[6, 12, 18, 24],
            num_classes=2)

        # NOTE(review): `fusion` is created here but not used by forward().
        self.fusion = ClassifierModule(21, [6, 18, 30, 42], [6, 18, 30, 42], 19)
        self.align_corners = align_corners
        self.pretrained = pretrained
        self.init_weight()

    def _upsample(self, logit, size):
        """Bilinearly resize `logit` to the spatial `size` of the input."""
        return F.interpolate(
            logit, size, mode='bilinear', align_corners=self.align_corners)

    def forward(self, x):
        """
        Run the backbone (and optionally the edge branch) on `x`.

        Returns:
            list: ``[seg_logit, aug_logit]`` when ``shape_stream`` is False, or
            ``[seg_logit, aug_logit, edge_logit]`` when it is True. Every
            logit is resized to ``x.shape[2:]``.
        """
        feat_list = self.backbone(x)
        out_size = x.shape[2:]

        if not self.shape_stream:
            return [self._upsample(logit, out_size)
                    for logit in feat_list[:2]]  # x6, x_aug

        edge_out = self.head(self.backbone.conv1_logit, feat_list[-1])
        # The edge head returns a single tensor, which has no `extend`; wrap it
        # in a list before appending the backbone logits. A list/tuple result
        # is still accepted as-is.
        if isinstance(edge_out, (list, tuple)):
            logit_list = list(edge_out)
        else:
            logit_list = [edge_out]
        logit_list.extend(feat_list[:2])

        edge_logit, seg_logit, aug_logit = [
            self._upsample(logit, out_size) for logit in logit_list
        ]
        return [seg_logit, aug_logit, edge_logit]

    def init_weight(self):
        """
        Load pretrained parameters into the backbone, if `pretrained` is set.

        Each backbone parameter `k` is looked up in the checkpoint under the
        key ``'backbone.' + k``. A parameter is skipped (with a warning) when
        its second dotted name component is ``'layer5'``, when the checkpoint
        has no such key, or when the shapes differ.
        """
        if self.pretrained is None:
            return

        para_state_dict = paddle.load(self.pretrained)
        model_state_dict = self.backbone.state_dict()
        num_params_loaded = 0
        for k in model_state_dict:
            k_parts = k.split('.')
            torchkey = 'backbone.' + k
            # Guard the index: a parameter name without a dot has a single
            # component and must not raise IndexError here.
            # NOTE(review): `k` comes from the backbone's own state dict, so
            # whether 'layer5' can appear as the *second* component depends on
            # the backbone's layout -- confirm index 1 (not 0) is intended.
            if len(k_parts) > 1 and k_parts[1] == 'layer5':
                logger.warning("{} should not be loaded".format(k))
            elif torchkey not in para_state_dict:
                logger.warning("{} is not in pretrained model".format(k))
            elif list(para_state_dict[torchkey].shape) != list(
                    model_state_dict[k].shape):
                logger.warning(
                    "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})"
                    .format(k, para_state_dict[torchkey].shape,
                            model_state_dict[k].shape))
            else:
                # Only existing keys are reassigned, so the dict size is
                # unchanged while iterating.
                model_state_dict[k] = para_state_dict[torchkey]
                num_params_loaded += 1
        self.backbone.set_dict(model_state_dict)
        logger.info("There are {}/{} variables loaded into {}.".format(
            num_params_loaded,
            len(model_state_dict), self.backbone.__class__.__name__))


class edge_branch(nn.Layer):
    """
    Edge head that fuses an early feature map with a deep feature map.

    Both inputs are projected to 512 channels, concatenated, refined by two
    ConvBN layers with a residual connection, and finally classified by a set
    of parallel dilated 3x3 convolutions whose outputs are summed.

    Args:
        inplanes (tuple|list): Input channel counts; ``inplanes[0]`` is for the
            early feature map and ``inplanes[1]`` for the deep one.
        out_channels (int): Output channels of the two ConvBN layers. Their
            result is added to the 1024-channel concatenation in forward, so
            this presumably has to be 1024 -- TODO confirm.
        dilation_series (list): Dilation of each parallel classifier branch.
        padding_series (list): Padding of each branch, paired element-wise
            with ``dilation_series``.
        num_classes (int): Output channels of every classifier branch.
    """

    def __init__(self, inplanes, out_channels, dilation_series, padding_series,
                 num_classes):
        super().__init__()
        proj_channels = 512

        # Per-input 3x3 projections to a common channel width.
        self.conv_x1 = nn.Conv2D(inplanes[0], proj_channels, kernel_size=3)
        self.conv_x4 = nn.Conv2D(inplanes[1], proj_channels, kernel_size=3)

        # Two ConvBN layers applied to the concatenated projections.
        self.conv0 = resnet_vd.ConvBNLayer(
            in_channels=proj_channels * 2,
            out_channels=out_channels,
            kernel_size=3,
            act='relu')
        self.conv1 = resnet_vd.ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None)

        self.add = layers.Add()
        self.relu = layers.Activation(act="relu")

        # One dilated classifier per (dilation, padding) pair; each gets its
        # own parameter attributes with a 10x learning-rate multiplier.
        self.conv2d_list = nn.LayerList()
        for rate, pad in zip(dilation_series, padding_series):
            branch = nn.Conv2D(
                out_channels,
                num_classes,
                kernel_size=3,
                stride=1,
                padding=pad,
                dilation=rate,
                weight_attr=paddle.ParamAttr(
                    initializer=nn.initializer.Normal(std=0.01),
                    learning_rate=10.0),
                bias_attr=paddle.ParamAttr(
                    initializer=nn.initializer.Constant(value=0.0),
                    learning_rate=10.0))
            self.conv2d_list.append(branch)

        # NOTE(review): `classifier` is not used by forward(); it is kept so
        # the layer's parameters stay the same.
        self.classifier = nn.Conv2D(
            out_channels, num_classes, kernel_size=3, stride=1)

    def forward(self, conv1_logit, x4):
        """
        Compute the edge logit from an early and a deep feature map.

        Args:
            conv1_logit (Tensor): Early feature map; it is resized to the
                spatial size of ``x4`` before being projected.
            x4 (Tensor): Deep feature map.

        Returns:
            Tensor: Sum of the outputs of all dilated classifier branches.
        """
        deep_shape = paddle.shape(x4)
        early = F.interpolate(
            conv1_logit,
            size=[deep_shape[2], deep_shape[3]],
            mode='bilinear',
            align_corners=True)

        early = self.conv_x1(early)
        deep = self.conv_x4(x4)  # e.g. 1, 512, 81, 161

        fused = paddle.concat([early, deep], axis=1)
        refined = self.conv1(self.conv0(fused))

        # Residual connection around the two ConvBN layers.
        y = self.relu(self.add(fused, refined))

        # Accumulate the branch outputs, starting from the first branch.
        branches = list(self.conv2d_list)
        out = branches[0](y)
        for branch in branches[1:]:
            out += branch(y)

        return out