# intern_image.py

# Copyright (c) 2022 OpenGVLab
# Copyright (c) OpenMMLab. All rights reserved.
# modified from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/intern_image.py
import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn.bricks import DropPath, build_activation_layer
from mmcv.cnn.bricks.transformer import FFN
from mmengine.model.weight_init import trunc_normal_
from ops_dcnv3 import modules as opsm

from mmpretrain.models.backbones.base_backbone import BaseBackbone
from mmpretrain.models.utils import CrossMultiheadAttention
from mmpretrain.registry import MODELS


class to_channels_first(nn.Module):
    """Move the last axis of a 4-D tensor to position 1.

    The permutation ``(0, 3, 1, 2)`` is applied, i.e. a channels-last
    ``(N, H, W, C)`` tensor becomes channels-first ``(N, C, H, W)``.
    The module holds no parameters or buffers.
    """

    def forward(self, x):
        # Target axis order: batch, former last axis, then the two middle
        # axes in their original order.
        axis_order = (0, 3, 1, 2)
        return x.permute(*axis_order)


class to_channels_last(nn.Module):
    """Move axis 1 of a 4-D tensor to the last position.

    The permutation ``(0, 2, 3, 1)`` is applied, i.e. a channels-first
    ``(N, C, H, W)`` tensor becomes channels-last ``(N, H, W, C)``.
    The module holds no parameters or buffers.
    """

    def forward(self, x):
        # Target axis order: batch, the two trailing axes, then former
        # axis 1.
        axis_order = (0, 2, 3, 1)
        return x.permute(*axis_order)


def build_norm_layer(dim,
                     norm_layer,
                     in_format='channels_last',
                     out_format='channels_last',
                     eps=1e-6):
    """Assemble a normalization layer with optional layout conversion.

    ``'BN'`` builds ``nn.BatchNorm2d`` and ``'LN'`` builds ``nn.LayerNorm``.
    A permutation module is placed before and/or after the normalization
    when the requested input/output layout differs from the layout the
    normalization operates on (channels-first for BN, channels-last for
    LN).

    Args:
        dim (int): Number of channels to normalize.
        norm_layer (str): Either ``'BN'`` or ``'LN'``.
        in_format (str): Layout of the incoming tensor, ``'channels_last'``
            or ``'channels_first'``. Default: 'channels_last'.
        out_format (str): Layout of the returned tensor, ``'channels_last'``
            or ``'channels_first'``. Default: 'channels_last'.
        eps (float): Epsilon passed to ``nn.LayerNorm``. It is not used for
            ``'BN'``. Default: 1e-6.

    Returns:
        nn.Sequential: The permutation (if any) and normalization modules.

    Raises:
        NotImplementedError: If ``norm_layer`` is neither ``'BN'`` nor
            ``'LN'``.
    """
    if norm_layer == 'BN':
        # BatchNorm2d works on channels-first data: convert on the way in
        # and back on the way out only when channels-last is requested.
        before = [to_channels_first()] if in_format == 'channels_last' \
            else []
        norm = nn.BatchNorm2d(dim)
        after = [to_channels_last()] if out_format == 'channels_last' \
            else []
    elif norm_layer == 'LN':
        # LayerNorm normalizes the trailing axis: convert only when
        # channels-first is requested.
        before = [to_channels_last()] if in_format == 'channels_first' \
            else []
        norm = nn.LayerNorm(dim, eps=eps)
        after = [to_channels_first()] if out_format == 'channels_first' \
            else []
    else:
        raise NotImplementedError(
            f'build_norm_layer does not support {norm_layer}')
    return nn.Sequential(*before, norm, *after)


class AttentiveBlock(nn.Module):
    """Cross-attention block with separate norms for query, key and value.

    The query and key inputs have their positional terms added before being
    normalized; the value input is normalized as-is. The three results are
    fed to a ``CrossMultiheadAttention`` module.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: False.
        qk_scale (float, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop (float, optional): Dropout rate of the attention output
            projection. Default: 0.0.
        attn_drop (float, optional): Attention dropout rate. Default: 0.0.
        drop_path (float, optional): Stochastic depth rate. The resulting
            ``drop_path`` module is created but not applied in
            :meth:`forward`. Default: 0.0.
        norm_cfg (dict, optional): Normalization config; only its ``type``
            entry is read. Default: dict(type='LN')
        out_dim (int, optional): Dimension of output. When set and
            different from ``dim``, the attention output projection is
            replaced by ``nn.Linear(dim, out_dim)``. Default: None.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_cfg=dict(type='LN'),
                 out_dim=None):
        super().__init__()
        norm_type = norm_cfg['type']
        # Query, key and value each own an independent normalization.
        self.norm1_q = build_norm_layer(dim, norm_type, eps=1e-6)
        self.norm1_k = build_norm_layer(dim, norm_type, eps=1e-6)
        self.norm1_v = build_norm_layer(dim, norm_type, eps=1e-6)

        attn_kwargs = dict(
            embed_dims=dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.cross_dcn = CrossMultiheadAttention(**attn_kwargs)

        # Swap the output projection only for a real change of width.
        change_width = bool(out_dim) and out_dim != dim
        if change_width:
            self.cross_dcn.proj = nn.Linear(dim, out_dim)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x_q, x_kv, pos_q, pos_k):
        """Attend from ``x_q`` to ``x_kv``.

        Args:
            x_q: Query input.
            x_kv: Input used for both key and value.
            pos_q: Positional term added to ``x_q`` before normalization.
            pos_k: Positional term added to ``x_kv`` for the key only.

        Returns:
            The output of the cross-attention module.
        """
        query = self.norm1_q(x_q + pos_q)
        key = self.norm1_k(x_kv + pos_k)
        # The value path gets no positional term.
        value = self.norm1_v(x_kv)
        return self.cross_dcn(query, k=key, v=value)


class AttentionPoolingBlock(AttentiveBlock):
    """Pool along axis 1 by attending from the mean to every element.

    The query is the mean of the input over axis 1; keys and values are the
    input itself. No positional terms are used.
    """

    def forward(self, x):
        """Reduce axis 1 of ``x`` to a single attended vector.

        Args:
            x: Input whose axis 1 is pooled.

        Returns:
            The attention output with its length-1 axis 1 squeezed away.
        """
        # keepdim retains a length-1 axis so the mean acts as one query.
        mean_query = x.mean(1, keepdim=True)
        # Both positional offsets are the scalar 0, i.e. no position info.
        pooled = super().forward(mean_query, x, 0, 0)
        return pooled.squeeze(1)


class DownsampleLayer(nn.Module):
    """Downsample layer of InternImage.

    A bias-free 3x3 convolution with stride 2 doubles the channel count,
    followed by a normalization that returns channels-last data.

    Args:
        channels (int): Number of input channels. The output has
            ``2 * channels`` channels.
        norm_layer (str): Normalization type understood by
            ``build_norm_layer``. Default: 'LN'.
    """

    def __init__(self, channels, norm_layer='LN'):
        super().__init__()
        doubled = 2 * channels
        self.conv = nn.Conv2d(
            channels,
            doubled,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False)
        # The convolution emits channels-first data; the norm block hands
        # channels-last data back to the caller.
        self.norm = build_norm_layer(
            doubled,
            norm_layer,
            in_format='channels_first',
            out_format='channels_last')

    def forward(self, x):
        """Downsample a channels-last input.

        Args:
            x: Input tensor; its last axis is moved to position 1 before
                the convolution.

        Returns:
            The convolved and normalized tensor.
        """
        channels_first = x.permute(0, 3, 1, 2)
        return self.norm(self.conv(channels_first))


class InternImageLayer(nn.Module):
    """Basic layer of InternImage.

    Each forward pass applies two residual updates: one through ``dcn``
    (the module built from ``core_op``) and one through a feed-forward
    network. ``post_norm`` and ``res_post_norm`` select where the
    LayerNorms are applied. When ``layer_scale`` is given, both branches
    are multiplied by learnable per-channel scales and ``res_post_norm``
    has no effect on the forward pass.

    Args:
        core_op (callable): Factory of the core operation. It is called
            with the keyword arguments ``channels``, ``kernel_size``,
            ``stride``, ``pad``, ``dilation``, ``group``, ``offset_scale``,
            ``act_layer``, ``norm_layer``, ``dw_kernel_size``,
            ``center_feature_scale`` and ``remove_center``.
        channels (int): Number of input channels.
        groups: Value forwarded to ``core_op`` as ``group``.
        mlp_ratio (float): Ratio of mlp hidden features to input channels.
            Default: 4.
        drop (float): Dropout rate of the feed-forward network.
            Default: 0.
        drop_path (float): Drop path rate. Default: 0.
        act_cfg (dict): Activation config. Default: dict(type='GELU')
        norm_cfg (dict): Normalization config; its ``type`` is forwarded to
            ``core_op``. Default: dict(type='LN')
        post_norm (bool): Whether to normalize after (instead of before)
            each branch. Default: False
        layer_scale (float, optional): Initial value of the per-channel
            branch scales. ``None`` disables layer scale. Default: None
        offset_scale (float): Forwarded to ``core_op``. Default: 1.0
        with_cp (bool): Whether to use checkpointing. Default: False
        dw_kernel_size (int, optional): Forwarded to ``core_op``.
            Default: None
        res_post_norm (bool): Whether to normalize both before and after
            each branch. Only consulted when ``layer_scale`` is ``None``
            and ``post_norm`` is False. Default: False
        center_feature_scale (bool): Forwarded to ``core_op``.
            Default: False
        remove_center (bool): Forwarded to ``core_op``. Default: False
    """

    def __init__(
        self,
        core_op,
        channels,
        groups,
        mlp_ratio=4.,
        drop=0.,
        drop_path=0.,
        act_cfg=dict(type='GELU'),
        norm_cfg=dict(type='LN'),
        post_norm=False,
        layer_scale=None,
        offset_scale=1.0,
        with_cp=False,
        dw_kernel_size=None,
        res_post_norm=False,
        center_feature_scale=False,
        remove_center=False,
    ):
        super().__init__()
        # Plain configuration flags.
        self.channels = channels
        self.groups = groups
        self.mlp_ratio = mlp_ratio
        self.with_cp = with_cp
        self.post_norm = post_norm

        # Core-operator branch.
        self.norm1 = build_norm_layer(channels, 'LN')
        core_kwargs = dict(
            channels=channels,
            kernel_size=3,
            stride=1,
            pad=1,
            dilation=1,
            group=groups,
            offset_scale=offset_scale,
            act_layer=act_cfg['type'],
            norm_layer=norm_cfg['type'],
            dw_kernel_size=dw_kernel_size,
            center_feature_scale=center_feature_scale,
            remove_center=remove_center,
        )
        self.dcn = core_op(**core_kwargs)
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Feed-forward branch. The FFN adds no identity of its own because
        # the residual sum is done in forward().
        self.norm2 = build_norm_layer(channels, 'LN')
        hidden_channels = int(channels * mlp_ratio)
        self.mlp = FFN(
            embed_dims=channels,
            feedforward_channels=hidden_channels,
            act_cfg=act_cfg,
            ffn_drop=drop,
            add_identity=False)

        # ``self.layer_scale`` is a bool flag; the scale values live in
        # ``gamma1`` / ``gamma2``.
        self.layer_scale = layer_scale is not None
        if self.layer_scale:
            self.gamma1 = nn.Parameter(
                layer_scale * torch.ones(channels), requires_grad=True)
            self.gamma2 = nn.Parameter(
                layer_scale * torch.ones(channels), requires_grad=True)

        self.res_post_norm = res_post_norm
        if res_post_norm:
            self.res_post_norm1 = build_norm_layer(channels, 'LN')
            self.res_post_norm2 = build_norm_layer(channels, 'LN')

    def forward(self, x):
        """Apply the core-operator and feed-forward residual updates.

        Args:
            x: Input tensor.

        Returns:
            The tensor after both residual updates.
        """

        def _residual_updates(feat):
            if self.layer_scale:
                # Scaled branches; res_post_norm is not consulted here.
                if self.post_norm:
                    core_out = self.gamma1 * self.norm1(self.dcn(feat))
                    feat = feat + self.drop_path(core_out)
                    ffn_out = self.gamma2 * self.norm2(self.mlp(feat))
                    feat = feat + self.drop_path(ffn_out)
                else:
                    core_out = self.gamma1 * self.dcn(self.norm1(feat))
                    feat = feat + self.drop_path(core_out)
                    ffn_out = self.gamma2 * self.mlp(self.norm2(feat))
                    feat = feat + self.drop_path(ffn_out)
                return feat

            if self.post_norm:
                core_out = self.norm1(self.dcn(feat))
                feat = feat + self.drop_path(core_out)
                ffn_out = self.norm2(self.mlp(feat))
                feat = feat + self.drop_path(ffn_out)
            elif self.res_post_norm:
                core_out = self.res_post_norm1(self.dcn(self.norm1(feat)))
                feat = feat + self.drop_path(core_out)
                ffn_out = self.res_post_norm2(self.mlp(self.norm2(feat)))
                feat = feat + self.drop_path(ffn_out)
            else:
                core_out = self.dcn(self.norm1(feat))
                feat = feat + self.drop_path(core_out)
                ffn_out = self.mlp(self.norm2(feat))
                feat = feat + self.drop_path(ffn_out)
            return feat

        # Checkpointing is only used when the input requires gradients.
        if self.with_cp and x.requires_grad:
            return cp.checkpoint(_residual_updates, x)
        return _residual_updates(x)


class InternImageBlock(nn.Module):
    """Block of InternImage.

    A stack of ``depth`` :class:`InternImageLayer` modules, optionally
    followed by a LayerNorm and a :class:`DownsampleLayer`.

    Args:
        core_op (callable): Core operation factory handed to every layer.
        channels (int): Number of input channels.
        depth (int): Number of layers in this block.
        groups: Value forwarded to every layer as ``groups``.
        downsample (bool): Whether to append a downsample layer.
            Default: True
        mlp_ratio (float): Ratio of mlp hidden features to input channels.
            Default: 4.
        drop (float): Dropout rate. Default: 0.
        drop_path (float | list): Drop path rate. A list supplies one rate
            per layer; any other value is shared by all layers.
            Default: 0.
        act_cfg (dict): Activation config. Default: dict(type='GELU')
        norm_cfg (dict): Normalization config; its ``type`` is also used
            for the downsample layer. Default: dict(type='LN')
        post_norm (bool): Whether to use post normalization.
            Default: False
        offset_scale (float): Offset scale. Default: 1.0
        layer_scale (float, optional): Layer scale initial value.
            Default: None
        with_cp (bool): Whether to use checkpoint. Default: False
        dw_kernel_size (int, optional): Forwarded to every layer.
            Default: None
        post_norm_block_ids (list, optional): Indices of the layers whose
            output passes through an additional LayerNorm. Default: None
        res_post_norm (bool): Forwarded to every layer. Default: False
        center_feature_scale (bool): Forwarded to every layer. It also
            forces the trailing LayerNorm to exist when ``post_norm`` is
            True. Default: False
        remove_center (bool): Forwarded to every layer. Default: False
    """

    def __init__(
        self,
        core_op,
        channels,
        depth,
        groups,
        downsample=True,
        mlp_ratio=4.,
        drop=0.,
        drop_path=0.,
        act_cfg=dict(type='GELU'),
        norm_cfg=dict(type='LN'),
        post_norm=False,
        offset_scale=1.0,
        layer_scale=None,
        with_cp=False,
        dw_kernel_size=None,
        post_norm_block_ids=None,
        res_post_norm=False,
        center_feature_scale=False,
        remove_center=False,
    ):
        super().__init__()
        self.channels = channels
        self.depth = depth
        self.post_norm = post_norm
        self.center_feature_scale = center_feature_scale

        # A list provides one stochastic-depth rate per layer.
        rate_per_layer = isinstance(drop_path, list)
        self.blocks = nn.ModuleList()
        for layer_idx in range(depth):
            layer_rate = drop_path[layer_idx] if rate_per_layer \
                else drop_path
            self.blocks.append(
                InternImageLayer(
                    core_op=core_op,
                    channels=channels,
                    groups=groups,
                    mlp_ratio=mlp_ratio,
                    drop=drop,
                    drop_path=layer_rate,
                    act_cfg=act_cfg,
                    norm_cfg=norm_cfg,
                    post_norm=post_norm,
                    layer_scale=layer_scale,
                    offset_scale=offset_scale,
                    with_cp=with_cp,
                    dw_kernel_size=dw_kernel_size,
                    res_post_norm=res_post_norm,
                    center_feature_scale=center_feature_scale,
                    remove_center=remove_center,
                ))

        # Trailing norm; must mirror the condition used in forward().
        if center_feature_scale or not self.post_norm:
            self.norm = build_norm_layer(channels, 'LN')

        self.post_norm_block_ids = post_norm_block_ids
        if post_norm_block_ids is not None:
            # One extra norm per listed layer index, in list order.
            extra_norms = [
                build_norm_layer(channels, 'LN', eps=1e-6)
                for _ in post_norm_block_ids
            ]
            self.post_norms = nn.ModuleList(extra_norms)

        if downsample:
            self.downsample = DownsampleLayer(
                channels=channels, norm_layer=norm_cfg['type'])
        else:
            self.downsample = None

    def forward(self, x, return_wo_downsample=False):
        """Run all layers, the trailing norm and the optional downsample.

        Args:
            x: Input tensor.
            return_wo_downsample (bool): If True, additionally return the
                features taken right before the downsample layer.
                Default: False

        Returns:
            The block output, or the tuple ``(output, pre_downsample)``
            when ``return_wo_downsample`` is True.
        """
        extra_ids = self.post_norm_block_ids
        for layer_idx, layer in enumerate(self.blocks):
            x = layer(x)
            if extra_ids is not None and layer_idx in extra_ids:
                # The position within the id list selects the norm module.
                x = self.post_norms[extra_ids.index(layer_idx)](x)

        if self.center_feature_scale or not self.post_norm:
            x = self.norm(x)

        pre_downsample = x
        if self.downsample is not None:
            x = self.downsample(x)

        if return_wo_downsample:
            return x, pre_downsample
        return x


@MODELS.register_module()
class InternImage(BaseBackbone):
    """InternImage backbone.

    A PyTorch implementation of `InternImage: Exploring Large-Scale Vision
    Foundation Models with Deformable Convolutions
    <https://arxiv.org/abs/2211.05778>`_.

    The network is a two-convolution stem followed by one
    :class:`InternImageBlock` per entry of ``stage_blocks``. Stage ``i``
    works on ``stem_channels * 2**i`` channels and every stage except the
    last one downsamples. The head is either a 1x1 convolution head or,
    with ``use_clip_projector=True``, an attention-pooling projector fed by
    the last two stages.

    Args:
        stem_channels (int): Number of channels produced by the stem, i.e.
            the width of the first stage. Default: 64
        stage_blocks (list): Number of layers in each stage.
            Default: [3, 4, 18, 5]
        groups (list): Group value of the core operator in each stage.
            Default: [3, 6, 12, 24]
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            Default: 4.
        drop_rate (float): Probability of an element to be zeroed.
            Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2
        drop_path_type (str): ``'uniform'`` uses ``drop_path_rate`` for
            every layer. Any other value ramps the rate linearly from 0 to
            ``drop_path_rate`` over all layers. Default: 'linear'
        act_cfg (dict): Activation layer. Default: dict(type='GELU')
        norm_cfg (dict): Normalization layer. Default: dict(type='LN')
        layer_scale (float, optional): Initial value of the layer scale.
            ``None`` disables layer scale. Default: None
        offset_scale (float): Offset scale forwarded to the core operator.
            Default: 1.0
        post_norm (bool): Whether to use post normalization.
            Default: False
        cls_scale (float): Width multiplier of the convolution head
            relative to the last stage. Default: 1.5
        with_cp (bool): Use checkpoint or not. Default: False
        dw_kernel_size (int, optional): Size of the dwconv. Default: None
        use_clip_projector (bool): Whether to use clip projector.
            Default: False
        level2_post_norm (bool): Whether to use level2 post norm, i.e.
            extra post norms in the stage with index 2. Default: False
        level2_post_norm_block_ids (list, optional): Indexes of post norm
            blocks. Default: None
        res_post_norm (bool): Whether to use res post norm. Default: False
        center_feature_scale (bool): Whether to use center feature scale.
            Default: False
        remove_center (bool): Forwarded to the core operator.
            Default: False
        init_cfg (dict, optional): Initialization config passed to the
            base class. Default: None
    """

    def __init__(self,
                 stem_channels=64,
                 stage_blocks=[3, 4, 18, 5],
                 groups=[3, 6, 12, 24],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.2,
                 drop_path_type='linear',
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 layer_scale=None,
                 offset_scale=1.0,
                 post_norm=False,
                 cls_scale=1.5,
                 with_cp=False,
                 dw_kernel_size=None,
                 use_clip_projector=False,
                 level2_post_norm=False,
                 level2_post_norm_block_ids=None,
                 res_post_norm=False,
                 center_feature_scale=False,
                 remove_center=False,
                 init_cfg=None):
        super(InternImage, self).__init__(init_cfg)

        self.core_op = 'DCNv3'
        self.num_stages = len(stage_blocks)
        # Channels double at every stage transition.
        self.num_features = int(stem_channels * 2**(self.num_stages - 1))
        self.post_norm = post_norm
        self.mlp_ratio = mlp_ratio
        self.use_clip_projector = use_clip_projector
        self.level2_post_norm_block_ids = level2_post_norm_block_ids
        self.remove_center = remove_center
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg

        # stem layer
        self._make_stem_layer(in_channels=3, stem_channels=stem_channels)
        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule
        total_depth = sum(stage_blocks)
        if drop_path_type == 'uniform':
            dpr = [drop_path_rate] * total_depth
        else:
            dpr = [
                x.item()
                for x in torch.linspace(0, drop_path_rate, total_depth)
            ]

        # InternImage Layers
        core_op_cls = getattr(opsm, self.core_op)
        self.layers = nn.ModuleList()
        stage_start = 0  # index in ``dpr`` of the current stage's 1st layer
        for i in range(self.num_stages):
            if level2_post_norm and i == 2:
                post_norm_block_ids = level2_post_norm_block_ids
            else:
                post_norm_block_ids = None

            stage_end = stage_start + stage_blocks[i]
            layer = InternImageBlock(
                core_op=core_op_cls,
                channels=int(stem_channels * 2**i),
                depth=stage_blocks[i],
                groups=groups[i],
                mlp_ratio=self.mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[stage_start:stage_end],
                act_cfg=act_cfg,
                norm_cfg=norm_cfg,
                post_norm=post_norm,
                # The last stage keeps its resolution.
                downsample=(i < self.num_stages - 1),
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,
                post_norm_block_ids=post_norm_block_ids,
                res_post_norm=res_post_norm,
                center_feature_scale=center_feature_scale,
                remove_center=remove_center,
            )
            self.layers.append(layer)
            stage_start = stage_end

        # Conv Head
        if not use_clip_projector:
            self.conv_head = nn.Sequential(
                nn.Conv2d(
                    self.num_features,
                    int(self.num_features * cls_scale),
                    kernel_size=1,
                    bias=False),
                build_norm_layer(
                    int(self.num_features * cls_scale), 'BN', 'channels_first',
                    'channels_first'), build_activation_layer(act_cfg))

        else:
            pretrain_embed_dim, _stride, attnpool_num_heads, clip_embed_dim \
                = 1024, 2, 16, 768
            # The last stage is upsampled by PixelShuffle so that it can be
            # summed with the projected third stage.
            self.dcnv3_head_x4 = nn.Sequential(
                nn.Conv2d(
                    in_channels=self.num_features,
                    out_channels=pretrain_embed_dim * (_stride**2),
                    kernel_size=1), nn.PixelShuffle(_stride))
            self.dcnv3_head_x3 = nn.Conv2d(
                in_channels=self.num_features // 2,
                out_channels=pretrain_embed_dim,
                kernel_size=1)
            self.clip_projector = AttentionPoolingBlock(
                dim=pretrain_embed_dim,
                num_heads=attnpool_num_heads,
                qkv_bias=True,
                qk_scale=None,
                drop=0.,
                attn_drop=0.,
                norm_cfg=norm_cfg,
                out_dim=clip_embed_dim)
            norm_layer = norm_cfg['type']
            self.fc_norm = build_norm_layer(
                clip_embed_dim, norm_layer, eps=1e-6)

    def init_weights(self):
        """Initialize Linear, LayerNorm and core-operator weights.

        Runs the base-class initialization first, then re-initializes
        every ``nn.Linear`` (truncated normal weights, zero bias), every
        ``nn.LayerNorm`` (unit weight, zero bias) and resets the parameters
        of every core-operator module.
        """
        super(InternImage, self).init_weights()

        core_op_cls = getattr(opsm, self.core_op)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                trunc_normal_(m.weight, std=.02)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)
            elif isinstance(m, core_op_cls):
                m._reset_parameters()

    def _make_stem_layer(self, in_channels, stem_channels):
        """Build ``self.patch_embed``, the two-convolution stem.

        Both convolutions use stride 2. The last normalization returns
        channels-last data, which is the layout the stages consume.

        Args:
            in_channels (int): Number of input image channels.
            stem_channels (int): Number of channels produced by the stem.
        """
        norm_layer = self.norm_cfg['type']
        self.patch_embed = nn.Sequential(
            nn.Conv2d(
                in_channels,
                stem_channels // 2,
                kernel_size=3,
                stride=2,
                padding=1),
            build_norm_layer(stem_channels // 2, norm_layer, 'channels_first',
                             'channels_first'),
            build_activation_layer(self.act_cfg),
            nn.Conv2d(
                stem_channels // 2,
                stem_channels,
                kernel_size=3,
                stride=2,
                padding=1),
            build_norm_layer(stem_channels, norm_layer, 'channels_first',
                             'channels_last'),
        )

    def forward_features(self, x):
        """Run stem, all stages and the convolution head.

        Args:
            x: Input tensor fed to the stem.

        Returns:
            tuple: One-element tuple with the convolution head output.
        """
        x = self.patch_embed(x)
        x = self.pos_drop(x)

        for layer in self.layers:
            x = layer(x)

        # Stages emit channels-last data; the conv head is channels-first.
        x = self.conv_head(x.permute(0, 3, 1, 2))
        return (x, )

    def forward_features_seq_out(self, x):
        """Run stem and stages, collecting each stage's pre-downsample map.

        Args:
            x: Input tensor fed to the stem.

        Returns:
            list: One tensor per stage, taken before that stage's
            downsample layer.
        """
        x = self.patch_embed(x)
        x = self.pos_drop(x)

        seq_out = []
        for layer in self.layers:
            x, x_ = layer(x, return_wo_downsample=True)
            seq_out.append(x_)
        return seq_out

    def forward_clip_projector(self, x):  # for InternImage-H/G
        """Fuse the last two stages and pool them with the clip projector.

        Exactly four stage outputs are unpacked, so this path requires a
        four-stage network.

        Args:
            x: Input tensor fed to the stem.

        Returns:
            tuple: One-element tuple with the normalized pooled feature.
        """
        xs = self.forward_features_seq_out(x)
        x1, x2, x3, x4 = xs

        x1 = x1.permute(0, 3, 1, 2)  # NHWC -> NCHW
        x2 = x2.permute(0, 3, 1, 2)  # NHWC -> NCHW
        x3 = x3.permute(0, 3, 1, 2)  # NHWC -> NCHW
        x4 = x4.permute(0, 3, 1, 2)  # NHWC -> NCHW

        x4 = self.dcnv3_head_x4(x4)
        x = x4
        x3 = self.dcnv3_head_x3(x3)
        x = x + x3

        # Flatten the spatial axes into a token axis: (N, C, H, W) ->
        # (N, H*W, C).
        x = x.flatten(-2).transpose(1, 2).contiguous()
        x = self.clip_projector(x)
        x = self.fc_norm(x)

        return (x, )

    def forward(self, x):
        """Dispatch to the convolution head or the clip projector path.

        Args:
            x: Input tensor fed to the stem.

        Returns:
            tuple: One-element tuple with the backbone output.
        """
        if not self.use_clip_projector:
            # for InternImage-T/S/B/L/XL
            return self.forward_features(x)
        else:
            # for InternImage-H/G
            return self.forward_clip_projector(x)

    @staticmethod
    def _checkpoint_filter(state_dict, prefix, local_metadata, strict,
                           missing_keys, unexpected_keys, error_msgs):
        """Convert an original InternImage checkpoint in place.

        The signature matches a ``load_state_dict`` pre-hook; only
        ``state_dict`` is used. If ``state_dict['model']`` holds weights in
        the original naming, they are renamed, stored at the top level of
        ``state_dict`` and the ``'model'`` entry is removed. Otherwise
        ``state_dict`` is left untouched.

        NOTE(review): nothing inside this class registers this function as
        a hook - confirm where (or whether) it is meant to be registered.
        """
        # Stem sub-module names of the original weights mapped to their
        # indices inside ``patch_embed``.
        stem_name_map = {
            'conv1': '0',
            'norm1': '1',
            'conv2': '3',
            'norm2': '4'
        }

        def internimage_to_mmpretrain():
            for k, v in state_dict['model'].items():
                if 'head.' in k and 'conv_head' not in k:
                    # Classification head parameters.
                    if 'weight' in k:
                        new_k = 'head.fc.weight'
                    else:
                        new_k = 'head.fc.bias'
                elif 'patch_embed' in k:
                    new_k = k
                    for old, new in stem_name_map.items():
                        new_k = new_k.replace(old, new)
                    new_k = 'backbone.' + new_k

                elif 'levels' in k:
                    new_k = k.replace('levels', 'layers')
                    if 'mlp' in new_k:
                        new_k = new_k.replace('fc1', 'layers.0.0')
                        new_k = new_k.replace('fc2', 'layers.1')
                    new_k = 'backbone.' + new_k
                elif 'clip_projector.cross_dcn.k_bias' in k:
                    # This entry is dropped rather than converted.
                    continue
                else:
                    new_k = 'backbone.' + k

                state_dict[new_k] = v
            del state_dict['model']

        # The original weights need to be converted to mmpretrain format.
        # Some modules in the original weights start with 'levels',
        # and in this implementation they are replaced with 'layers'.
        if 'model' in state_dict and 'levels.0.blocks.0.norm1.0.weight'\
                in state_dict['model']:
            internimage_to_mmpretrain()