cspnext.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
#   
# Licensed under the Apache License, Version 2.0 (the "License");   
# you may not use this file except in compliance with the License.  
# You may obtain a copy of the License at   
#   
#     http://www.apache.org/licenses/LICENSE-2.0    
# 
# Unless required by applicable law or agreed to in writing, software   
# distributed under the License is distributed on an "AS IS" BASIS, 
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
# See the License for the specific language governing permissions and   
# limitations under the License.

import paddle
import paddle.nn as nn
from ppdet.core.workspace import register, serializable
from .csp_darknet import BaseConv, DWConv, SPPLayer
from ..shape_spec import ShapeSpec

__all__ = ['CSPNeXtBlock', 'CSPNeXtLayer', 'CSPNeXt']


class CSPNeXtBlock(nn.Layer):
    """The basic bottleneck block used in CSPNeXt."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 kernel_size=5,
                 bias=False,
                 act="silu"):
        super(CSPNeXtBlock, self).__init__()
        hidden_channels = int(out_channels * expansion)
        Conv = DWConv if depthwise else BaseConv
        self.conv1 = BaseConv(
            in_channels, hidden_channels, 3, stride=1, bias=bias, act=act)
        self.conv2 = DWConv(
            hidden_channels,
            out_channels,
            ksize=kernel_size,
            stride=1,
            bias=bias,
            act=act)
        self.add_shortcut = shortcut and in_channels == out_channels

    def forward(self, x):
        y = self.conv2(self.conv1(x))
        if self.add_shortcut:
            y = y + x
        return y


class ChannelAttention(nn.Layer):
    def __init__(self, channels=256):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)
        self.fc = nn.Conv2D(channels, channels, 1, 1, bias_attr=True)
        self.act = nn.Hardsigmoid()

    def forward(self, x):
        y = self.pool(x)
        out = self.act(self.fc(y))
        return x * out


class CSPNeXtLayer(nn.Layer):
    """CSPNeXt layer used in RTMDet, like CSPLayer(C3) in YOLOv5/YOLOX"""

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=1,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 ch_attn=False,
                 bias=False,
                 act="silu"):
        super(CSPNeXtLayer, self).__init__()
        hidden_channels = int(out_channels * expansion)
        self.ch_attn = ch_attn
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv3 = BaseConv(
            hidden_channels * 2,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)
        self.bottlenecks = nn.Sequential(* [
            CSPNeXtBlock(
                hidden_channels,
                hidden_channels,
                shortcut=shortcut,
                expansion=1.0,
                depthwise=depthwise,
                bias=bias,
                act=act) for _ in range(num_blocks)
        ])
        if ch_attn:
            self.ch_attn = ChannelAttention(hidden_channels * 2)

    def forward(self, x):
        x_1 = self.conv1(x)
        x_1 = self.bottlenecks(x_1)
        x_2 = self.conv2(x)
        x = paddle.concat([x_1, x_2], axis=1)
        if self.ch_attn:
            x = self.ch_attn(x)
        x = self.conv3(x)
        return x


@register
@serializable
class CSPNeXt(nn.Layer):
    """
    CSPNeXt backbone of RTMDet.
    Args:
        arch (str): Architecture of CSPNeXt, from {P5, P6}.
        depth_mult (float): Depth multiplier, multiply number of channels in
            each layer, default as 1.0.
        width_mult (float): Width multiplier, multiply number of blocks in
            CSPNeXtLayer, default as 1.0.
        depthwise (bool): Whether to use depth-wise conv layer.
        spp_kernel_sizes (tuple): kernel_sizes of SPP
        ch_attn (bool): Whether to add channel attention.
        act (str): Activation function type, default as 'silu'.
        trt (str): Whether to use trt infer in activation.
        return_idx (list): Index of stages whose feature maps are returned.
    """

    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']

    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
    arch_settings = {
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 6, True, False], [512, 1024, 3, False, True]],
        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 6, True, False], [512, 768, 3, True, False],
               [768, 1024, 3, False, True]]
    }

    def __init__(self,
                 arch='P5',
                 depth_mult=1.0,
                 width_mult=1.0,
                 depthwise=False,
                 spp_kernel_sizes=(5, 9, 13),
                 ch_attn=True,
                 act='silu',
                 trt=False,
                 return_idx=[2, 3, 4]):
        super(CSPNeXt, self).__init__()
        self.arch = arch
        self.return_idx = return_idx
        Conv = DWConv if depthwise else BaseConv
        arch_setting = self.arch_settings[arch]
        stem_ch = int(arch_setting[0][0] * width_mult // 2)
        stem_out_ch = int(stem_ch * 2)

        self.stem = nn.Sequential(
            ('conv1', BaseConv(
                3, stem_ch, 3, 2, act=act)), ('conv2', BaseConv(
                    stem_ch, stem_ch, 3, 1, act=act)), ('conv3', BaseConv(
                        stem_ch, stem_out_ch, 3, 1, act=act)))

        _out_channels = [stem_out_ch]
        layers_num = 1
        self.csp_next_blocks = []

        for i, (in_ch, out_ch, n, shortcut, use_spp) in enumerate(arch_setting):
            in_channels = int(in_ch * width_mult)
            out_channels = int(out_ch * width_mult)
            _out_channels.append(out_channels)
            num_blocks = max(round(n * depth_mult), 1)
            stage = []

            conv_layer = self.add_sublayer(
                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
                Conv(
                    in_channels, out_channels, 3, 2, act=act))
            stage.append(conv_layer)
            layers_num += 1

            if use_spp:
                spp_layer = self.add_sublayer(
                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
                    SPPLayer(
                        out_channels,
                        out_channels,
                        kernel_sizes=spp_kernel_sizes,
                        bias=False,
                        act=act))
                stage.append(spp_layer)
                layers_num += 1

            csp_layer = self.add_sublayer(
                'layers{}.stage{}.cspnext_layer'.format(layers_num, i + 1),
                CSPNeXtLayer(
                    out_channels,
                    out_channels,
                    num_blocks=num_blocks,
                    shortcut=shortcut,
                    depthwise=depthwise,
                    ch_attn=ch_attn,
                    bias=False,
                    act=act))
            stage.append(csp_layer)
            layers_num += 1

            self.csp_next_blocks.append(nn.Sequential(*stage))

        self._out_channels = [_out_channels[i] for i in self.return_idx]
        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]

    def forward(self, inputs):
        x = inputs['image']
        outputs = []
        x = self.stem(x)
        for i, layer in enumerate(self.csp_next_blocks):
            x = layer(x)
            if i + 1 in self.return_idx:
                outputs.append(x)
        return outputs

    @property
    def out_shape(self):
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self._out_channels, self.strides)
        ]