release classification

59c80aa2 · PRC-Huang · zhe chen · 2cab2294 · 59c80aa2 · 59c80aa2
Commit 59c80aa2 authored Mar 08, 2023 by PRC-Huang Committed by zhe chen Mar 08, 2023
20 changed files
--- a/classification/models/__init__.py
+++ b/classification/models/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from .build import build_model
\ No newline at end of file
--- a/classification/models/build.py
+++ b/classification/models/build.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from .intern_image import InternImage
+def build_model(config):
+    model_type = config.MODEL.TYPE
+    if model_type == 'intern_image':
+        model = InternImage(
+            core_op=config.MODEL.INTERN_IMAGE.CORE_OP,
+            num_classes=config.MODEL.NUM_CLASSES,
+            channels=config.MODEL.INTERN_IMAGE.CHANNELS,
+            depths=config.MODEL.INTERN_IMAGE.DEPTHS,
+            groups=config.MODEL.INTERN_IMAGE.GROUPS,
+            layer_scale=config.MODEL.INTERN_IMAGE.LAYER_SCALE,
+            offset_scale=config.MODEL.INTERN_IMAGE.OFFSET_SCALE,
+            post_norm=config.MODEL.INTERN_IMAGE.POST_NORM,
+            mlp_ratio=config.MODEL.INTERN_IMAGE.MLP_RATIO,
+            with_cp=config.TRAIN.USE_CHECKPOINT,
+        )
+    else:
+        raise NotImplementedError(f"Unkown model: {model_type}")
+    return model
--- a/classification/models/intern_image.py
+++ b/classification/models/intern_image.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import trunc_normal_, DropPath
+from ops_dcnv3 import modules as opsm
+class to_channels_first(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return x.permute(0, 3, 1, 2)
+class to_channels_last(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return x.permute(0, 2, 3, 1)
+def build_norm_layer(dim,
+                     norm_layer,
+                     in_format='channels_last',
+                     out_format='channels_last',
+                     eps=1e-6):
+    layers = []
+    if norm_layer == 'BN':
+        if in_format == 'channels_last':
+            layers.append(to_channels_first())
+        layers.append(nn.BatchNorm2d(dim))
+        if out_format == 'channels_last':
+            layers.append(to_channels_last())
+    elif norm_layer == 'LN':
+        if in_format == 'channels_first':
+            layers.append(to_channels_last())
+        layers.append(nn.LayerNorm(dim, eps=eps))
+        if out_format == 'channels_first':
+            layers.append(to_channels_first())
+    else:
+        raise NotImplementedError(
+            f'build_norm_layer does not support {norm_layer}')
+    return nn.Sequential(*layers)
+def build_act_layer(act_layer):
+    if act_layer == 'ReLU':
+        return nn.ReLU(inplace=True)
+    elif act_layer == 'SiLU':
+        return nn.SiLU(inplace=True)
+    elif act_layer == 'GELU':
+        return nn.GELU()
+    raise NotImplementedError(f'build_act_layer does not support {act_layer}')
+class StemLayer(nn.Module):
+    r""" Stem layer of InternImage
+    Args:
+        in_chans (int): number of input channels
+        out_chans (int): number of output channels
+        act_layer (str): activation layer
+        norm_layer (str): normalization layer
+    """
+    def __init__(self,
+                 in_chans=3,
+                 out_chans=96,
+                 act_layer='GELU',
+                 norm_layer='BN'):
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_chans,
+                               out_chans // 2,
+                               kernel_size=3,
+                               stride=2,
+                               padding=1)
+        self.norm1 = build_norm_layer(out_chans // 2, norm_layer,
+                                      'channels_first', 'channels_first')
+        self.act = build_act_layer(act_layer)
+        self.conv2 = nn.Conv2d(out_chans // 2,
+                               out_chans,
+                               kernel_size=3,
+                               stride=2,
+                               padding=1)
+        self.norm2 = build_norm_layer(out_chans, norm_layer, 'channels_first',
+                                      'channels_last')
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.act(x)
+        x = self.conv2(x)
+        x = self.norm2(x)
+        return x
+class DownsampleLayer(nn.Module):
+    r""" Downsample layer of InternImage
+    Args:
+        channels (int): number of input channels
+        norm_layer (str): normalization layer
+    """
+    def __init__(self, channels, norm_layer='LN'):
+        super().__init__()
+        self.conv = nn.Conv2d(channels,
+                              2 * channels,
+                              kernel_size=3,
+                              stride=2,
+                              padding=1,
+                              bias=False)
+        self.norm = build_norm_layer(2 * channels, norm_layer,
+                                     'channels_first', 'channels_last')
+    def forward(self, x):
+        x = self.conv(x.permute(0, 3, 1, 2))
+        x = self.norm(x)
+        return x
+class MLPLayer(nn.Module):
+    r""" MLP layer of InternImage
+    Args:
+        in_features (int): number of input features
+        hidden_features (int): number of hidden features
+        out_features (int): number of output features
+        act_layer (str): activation layer
+        drop (float): dropout rate
+    """
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer='GELU',
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = build_act_layer(act_layer)
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class InternImageLayer(nn.Module):
+    r""" Basic layer of InternImage
+    Args:
+        core_op (nn.Module): core operation of InternImage
+        channels (int): number of input channels
+        groups (list): Groups of each block.
+        mlp_ratio (float): ratio of mlp hidden features to input channels
+        drop (float): dropout rate
+        drop_path (float): drop path rate
+        act_layer (str): activation layer
+        norm_layer (str): normalization layer
+        post_norm (bool): whether to use post normalization
+        layer_scale (float): layer scale
+        offset_scale (float): offset scale
+        with_cp (bool): whether to use checkpoint
+    """
+    def __init__(self,
+                 core_op,
+                 channels,
+                 groups,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer='GELU',
+                 norm_layer='LN',
+                 post_norm=False,
+                 layer_scale=None,
+                 offset_scale=1.0,
+                 with_cp=False):
+        super().__init__()
+        self.channels = channels
+        self.groups = groups
+        self.mlp_ratio = mlp_ratio
+        self.with_cp = with_cp
+        self.norm1 = build_norm_layer(channels, 'LN')
+        self.post_norm = post_norm
+        self.dcn = core_op(channels=channels,
+                           kernel_size=3,
+                           stride=1,
+                           pad=1,
+                           dilation=1,
+                           group=groups,
+                           offset_scale=offset_scale,
+                           act_layer=act_layer,
+                           norm_layer=norm_layer)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. \
+            else nn.Identity()
+        self.norm2 = build_norm_layer(channels, 'LN')
+        self.mlp = MLPLayer(in_features=channels,
+                            hidden_features=int(channels * mlp_ratio),
+                            act_layer=act_layer,
+                            drop=drop)
+        self.layer_scale = layer_scale is not None
+        if self.layer_scale:
+            self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
+                                       requires_grad=True)
+            self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
+                                       requires_grad=True)
+    def forward(self, x):
+        def _inner_forward(x):
+            if not self.layer_scale:
+                if self.post_norm:
+                    x = x + self.drop_path(self.norm1(self.dcn(x)))
+                    x = x + self.drop_path(self.norm2(self.mlp(x)))
+                else:
+                    x = x + self.drop_path(self.dcn(self.norm1(x)))
+                    x = x + self.drop_path(self.mlp(self.norm2(x)))
+                return x
+            if self.post_norm:
+                x = x + self.drop_path(self.gamma1 * self.norm1(self.dcn(x)))
+                x = x + self.drop_path(self.gamma2 * self.norm2(self.mlp(x)))
+            else:
+                x = x + self.drop_path(self.gamma1 * self.dcn(self.norm1(x)))
+                x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
+            return x
+        if self.with_cp and x.requires_grad:
+            x = checkpoint.checkpoint(_inner_forward, x)
+        else:
+            x = _inner_forward(x)
+        return x
+class InternImageBlock(nn.Module):
+    r""" Block of InternImage
+    Args:
+        core_op (nn.Module): core operation of InternImage
+        channels (int): number of input channels
+        depths (list): Depth of each block.
+        groups (list): Groups of each block.
+        mlp_ratio (float): ratio of mlp hidden features to input channels
+        drop (float): dropout rate
+        drop_path (float): drop path rate
+        act_layer (str): activation layer
+        norm_layer (str): normalization layer
+        post_norm (bool): whether to use post normalization
+        layer_scale (float): layer scale
+        offset_scale (float): offset scale
+        with_cp (bool): whether to use checkpoint
+    """
+    def __init__(self,
+                 core_op,
+                 channels,
+                 depth,
+                 groups,
+                 downsample=True,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer='GELU',
+                 norm_layer='LN',
+                 post_norm=False,
+                 offset_scale=1.0,
+                 layer_scale=None,
+                 with_cp=False):
+        super().__init__()
+        self.channels = channels
+        self.depth = depth
+        self.post_norm = post_norm
+        self.blocks = nn.ModuleList([
+            InternImageLayer(core_op=core_op,
+                             channels=channels,
+                             groups=groups,
+                             mlp_ratio=mlp_ratio,
+                             drop=drop,
+                             drop_path=drop_path[i] if isinstance(
+                                 drop_path, list) else drop_path,
+                             act_layer=act_layer,
+                             norm_layer=norm_layer,
+                             post_norm=post_norm,
+                             layer_scale=layer_scale,
+                             offset_scale=offset_scale,
+                             with_cp=with_cp) for i in range(depth)
+        ])
+        if not self.post_norm:
+            self.norm = build_norm_layer(channels, 'LN')
+        self.downsample = DownsampleLayer(
+            channels=channels, norm_layer=norm_layer) if downsample else None
+    def forward(self, x, return_wo_downsample=False):
+        for blk in self.blocks:
+            x = blk(x)
+        if not self.post_norm:
+            x = self.norm(x)
+        if return_wo_downsample:
+            x_ = x
+        if self.downsample is not None:
+            x = self.downsample(x)
+        if return_wo_downsample:
+            return x, x_
+        return x
+class InternImage(nn.Module):
+    r""" InternImage
+        A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        core_op (str): Core operator. Default: 'DCNv3'
+        channels (int): Number of the first stage. Default: 64
+        depths (list): Depth of each block. Default: [3, 4, 18, 5]
+        groups (list): Groups of each block. Default: [3, 6, 12, 24]
+        num_classes (int): Number of classes. Default: 1000
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop_rate (float): Probability of an element to be zeroed. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.
+        act_layer (str): Activation layer. Default: 'GELU'
+        norm_layer (str): Normalization layer. Default: 'LN'
+        layer_scale (bool): Whether to use layer scale. Default: False
+        cls_scale (bool): Whether to use class scale. Default: False
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+    """
+    def __init__(self,
+                 core_op='DCNv3',
+                 channels=64,
+                 depths=[3, 4, 18, 5],
+                 groups=[3, 6, 12, 24],
+                 num_classes=1000,
+                 mlp_ratio=4.,
+                 drop_rate=0.,
+                 drop_path_rate=0.2,
+                 drop_path_type='linear',
+                 act_layer='GELU',
+                 norm_layer='LN',
+                 layer_scale=None,
+                 offset_scale=1.0,
+                 post_norm=False,
+                 cls_scale=1.5,
+                 with_cp=False,
+                 **kwargs):
+        super().__init__()
+        self.core_op = core_op
+        self.num_classes = num_classes
+        self.num_levels = len(depths)
+        self.depths = depths
+        self.channels = channels
+        self.num_features = int(channels * 2**(self.num_levels - 1))
+        self.post_norm = post_norm
+        self.mlp_ratio = mlp_ratio
+        print(f'using core type: {core_op}')
+        print(f'using activation layer: {act_layer}')
+        print(f'using main norm layer: {norm_layer}')
+        print(f'using dpr: {drop_path_type}, {drop_path_rate}')
+        in_chans = 3
+        self.patch_embed = StemLayer(in_chans=in_chans,
+                                     out_chans=channels,
+                                     act_layer=act_layer,
+                                     norm_layer=norm_layer)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]
+        if drop_path_type == 'uniform':
+            for i in range(len(dpr)):
+                dpr[i] = drop_path_rate
+        self.levels = nn.ModuleList()
+        for i in range(self.num_levels):
+            level = InternImageBlock(
+                core_op=getattr(opsm, core_op),
+                channels=int(channels * 2**i),
+                depth=depths[i],
+                groups=groups[i],
+                mlp_ratio=self.mlp_ratio,
+                drop=drop_rate,
+                drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                post_norm=post_norm,
+                downsample=(i < self.num_levels - 1),
+                layer_scale=layer_scale,
+                offset_scale=offset_scale,
+                with_cp=with_cp)
+            self.levels.append(level)
+        self.conv_head = nn.Sequential(
+            nn.Conv2d(self.num_features,
+                      int(self.num_features * cls_scale),
+                      kernel_size=1,
+                      bias=False),
+            build_norm_layer(int(self.num_features * cls_scale), 'BN',
+                             'channels_first', 'channels_first'),
+            build_act_layer(act_layer))
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.head = nn.Linear(int(self.num_features * cls_scale), num_classes) \
+            if num_classes > 0 else nn.Identity()
+        self.num_layers = len(depths)
+        self.apply(self._init_weights)
+        self.apply(self._init_deform_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    def _init_deform_weights(self, m):
+        if isinstance(m, getattr(opsm, self.core_op)):
+            m._reset_parameters()
+    @torch.jit.ignore
+    def lr_decay_keywards(self, decay_ratio=0.87):
+        lr_ratios = {}
+        # blocks
+        idx = 0
+        for i in range(4):
+            layer_num = 3 - i  # 3 2 1 0
+            for j in range(self.depths[layer_num]):
+                block_num = self.depths[layer_num] - j - 1
+                tag = 'levels.{}.blocks.{}.'.format(layer_num, block_num)
+                decay = 1.0 * (decay_ratio**idx)
+                lr_ratios[tag] = decay
+                idx += 1
+        # patch_embed (before stage-1)
+        lr_ratios["patch_embed"] = lr_ratios['levels.0.blocks.0.']
+        # levels.0.downsample (between stage-1 and stage-2)
+        lr_ratios["levels.0.downsample"] = lr_ratios['levels.1.blocks.0.']
+        lr_ratios["levels.0.norm"] = lr_ratios['levels.1.blocks.0.']
+        # levels.1.downsample (between stage-2 and stage-3)
+        lr_ratios["levels.1.downsample"] = lr_ratios['levels.2.blocks.0.']
+        lr_ratios["levels.1.norm"] = lr_ratios['levels.2.blocks.0.']
+        # levels.2.downsample (between stage-3 and stage-4)
+        lr_ratios["levels.2.downsample"] = lr_ratios['levels.3.blocks.0.']
+        lr_ratios["levels.2.norm"] = lr_ratios['levels.3.blocks.0.']
+        return lr_ratios
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        x = self.pos_drop(x)
+        for level in self.levels:
+            x = level(x)
+        x = self.conv_head(x.permute(0, 3, 1, 2))
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+        return x
+    def forward_features_seq_out(self, x):
+        x = self.patch_embed(x)
+        x = self.pos_drop(x)
+        seq_out = []
+        for level in self.levels:
+            x, x_ = level(x, return_wo_downsample=True)
+            seq_out.append(x_)
+        return seq_out
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.head(x)
+        return x
--- a/classification/ops_dcnv3/functions/__init__.py
+++ b/classification/ops_dcnv3/functions/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch
--- a/classification/ops_dcnv3/functions/dcnv3_func.py
+++ b/classification/ops_dcnv3/functions/dcnv3_func.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.cuda.amp import custom_bwd, custom_fwd
+import DCNv3
+class DCNv3Function(Function):
+    @staticmethod
+    @custom_fwd
+    def forward(
+            ctx, input, offset, mask,
+            kernel_h, kernel_w, stride_h, stride_w,
+            pad_h, pad_w, dilation_h, dilation_w,
+            group, group_channels, offset_scale, im2col_step):
+        ctx.kernel_h = kernel_h
+        ctx.kernel_w = kernel_w
+        ctx.stride_h = stride_h
+        ctx.stride_w = stride_w
+        ctx.pad_h = pad_h
+        ctx.pad_w = pad_w
+        ctx.dilation_h = dilation_h
+        ctx.dilation_w = dilation_w
+        ctx.group = group
+        ctx.group_channels = group_channels
+        ctx.offset_scale = offset_scale
+        ctx.im2col_step = im2col_step
+        output = DCNv3.dcnv3_forward(
+            input, offset, mask, kernel_h,
+            kernel_w, stride_h, stride_w, pad_h,
+            pad_w, dilation_h, dilation_w, group,
+            group_channels, offset_scale, ctx.im2col_step)
+        ctx.save_for_backward(input, offset, mask)
+        return output
+    @staticmethod
+    @once_differentiable
+    @custom_bwd
+    def backward(ctx, grad_output):
+        input, offset, mask = ctx.saved_tensors
+        grad_input, grad_offset, grad_mask = \
+            DCNv3.dcnv3_backward(
+                input, offset, mask, ctx.kernel_h,
+                ctx.kernel_w, ctx.stride_h, ctx.stride_w, ctx.pad_h,
+                ctx.pad_w, ctx.dilation_h, ctx.dilation_w, ctx.group,
+                ctx.group_channels, ctx.offset_scale, grad_output.contiguous(), ctx.im2col_step)
+        return grad_input, grad_offset, grad_mask, \
+            None, None, None, None, None, None, None, None, None, None, None, None
+    @staticmethod
+    def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h,
+                 stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+                 group_channels, offset_scale, im2col_step):
+        """Symbolic function for mmdeploy::DCNv3.
+        Returns:
+            DCNv3 op for onnx.
+        """
+        return g.op(
+            'mmdeploy::TRTDCNv3',
+            input,
+            offset,
+            mask,
+            kernel_h_i=int(kernel_h),
+            kernel_w_i=int(kernel_w),
+            stride_h_i=int(stride_h),
+            stride_w_i=int(stride_w),
+            pad_h_i=int(pad_h),
+            pad_w_i=int(pad_w),
+            dilation_h_i=int(dilation_h),
+            dilation_w_i=int(dilation_w),
+            group_i=int(group),
+            group_channels_i=int(group_channels),
+            offset_scale_f=float(offset_scale),
+            im2col_step_i=int(im2col_step),
+        )
+def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1):
+    _, H_, W_, _ = spatial_shapes
+    H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
+    W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
+    ref_y, ref_x = torch.meshgrid(
+        torch.linspace(
+            # pad_h + 0.5,
+            # H_ - pad_h - 0.5,
+            (dilation_h * (kernel_h - 1)) // 2 + 0.5,
+            (dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h,
+            H_out,
+            dtype=torch.float32,
+            device=device),
+        torch.linspace(
+            # pad_w + 0.5,
+            # W_ - pad_w - 0.5,
+            (dilation_w * (kernel_w - 1)) // 2 + 0.5,
+            (dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w,
+            W_out,
+            dtype=torch.float32,
+            device=device))
+    ref_y = ref_y.reshape(-1)[None] / H_
+    ref_x = ref_x.reshape(-1)[None] / W_
+    ref = torch.stack((ref_x, ref_y), -1).reshape(
+        1, H_out, W_out, 1, 2)
+    return ref
+def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h, dilation_w, group, device):
+    _, H_, W_, _ = spatial_shapes
+    points_list = []
+    x, y = torch.meshgrid(
+        torch.linspace(
+            -((dilation_w * (kernel_w - 1)) // 2),
+            -((dilation_w * (kernel_w - 1)) // 2) +
+            (kernel_w - 1) * dilation_w, kernel_w,
+            dtype=torch.float32,
+            device=device),
+        torch.linspace(
+            -((dilation_h * (kernel_h - 1)) // 2),
+            -((dilation_h * (kernel_h - 1)) // 2) +
+            (kernel_h - 1) * dilation_h, kernel_h,
+            dtype=torch.float32,
+            device=device))
+    points_list.extend([x / W_, y / H_])
+    grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\
+        repeat(1, group, 1).permute(1, 0, 2)
+    grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2)
+    return grid
+def dcnv3_core_pytorch(
+        input, offset, mask, kernel_h,
+        kernel_w, stride_h, stride_w, pad_h,
+        pad_w, dilation_h, dilation_w, group,
+        group_channels, offset_scale):
+    # for debug and test only,
+    # need to use cuda version instead
+    input = F.pad(
+        input,
+        [0, 0, pad_h, pad_h, pad_w, pad_w])
+    N_, H_in, W_in, _ = input.shape
+    _, H_out, W_out, _ = offset.shape
+    ref = _get_reference_points(
+        input.shape, input.device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w)
+    grid = _generate_dilation_grids(
+        input.shape, kernel_h, kernel_w, dilation_h, dilation_w, group, input.device)
+    spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
+        repeat(1, 1, 1, group*kernel_h*kernel_w).to(input.device)
+    sampling_locations = (ref + grid * offset_scale).repeat(N_, 1, 1, 1, 1).flatten(3, 4) + \
+        offset * offset_scale / spatial_norm
+    P_ = kernel_h * kernel_w
+    sampling_grids = 2 * sampling_locations - 1
+    # N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, W_in
+    input_ = input.view(N_, H_in*W_in, group*group_channels).transpose(1, 2).\
+        reshape(N_*group, group_channels, H_in, W_in)
+    # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
+    sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
+        flatten(0, 1)
+    # N_*group, group_channels, H_out*W_out, P_
+    sampling_input_ = F.grid_sample(
+        input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
+    # (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_ -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_)
+    mask = mask.view(N_, H_out*W_out, group, P_).transpose(1, 2).\
+        reshape(N_*group, 1, H_out*W_out, P_)
+    output = (sampling_input_ * mask).sum(-1).view(N_,
+                                                   group*group_channels, H_out*W_out)
+    return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
--- a/classification/ops_dcnv3/make.sh
+++ b/classification/ops_dcnv3/make.sh
+#!/usr/bin/env bash
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+python setup.py build install
--- a/classification/ops_dcnv3/modules/__init__.py
+++ b/classification/ops_dcnv3/modules/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from .dcnv3 import DCNv3, DCNv3_pytorch
\ No newline at end of file
--- a/classification/ops_dcnv3/modules/dcnv3.py
+++ b/classification/ops_dcnv3/modules/dcnv3.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import warnings
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+from ..functions import DCNv3Function, dcnv3_core_pytorch
+class to_channels_first(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return x.permute(0, 3, 1, 2)
+class to_channels_last(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x):
+        return x.permute(0, 2, 3, 1)
+def build_norm_layer(dim,
+                     norm_layer,
+                     in_format='channels_last',
+                     out_format='channels_last',
+                     eps=1e-6):
+    layers = []
+    if norm_layer == 'BN':
+        if in_format == 'channels_last':
+            layers.append(to_channels_first())
+        layers.append(nn.BatchNorm2d(dim))
+        if out_format == 'channels_last':
+            layers.append(to_channels_last())
+    elif norm_layer == 'LN':
+        if in_format == 'channels_first':
+            layers.append(to_channels_last())
+        layers.append(nn.LayerNorm(dim, eps=eps))
+        if out_format == 'channels_first':
+            layers.append(to_channels_first())
+    else:
+        raise NotImplementedError(
+            f'build_norm_layer does not support {norm_layer}')
+    return nn.Sequential(*layers)
+def build_act_layer(act_layer):
+    if act_layer == 'ReLU':
+        return nn.ReLU(inplace=True)
+    elif act_layer == 'SiLU':
+        return nn.SiLU(inplace=True)
+    elif act_layer == 'GELU':
+        return nn.GELU()
+    raise NotImplementedError(f'build_act_layer does not support {act_layer}')
+def _is_power_of_2(n):
+    if (not isinstance(n, int)) or (n < 0):
+        raise ValueError(
+            "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return (n & (n-1) == 0) and n != 0
+class DCNv3_pytorch(nn.Module):
+    def __init__(
+            self, channels=64, kernel_size=3, stride=1,
+            pad=1, dilation=1, group=4, offset_scale=1.0,
+            act_layer='GELU', norm_layer='LN'):
+        """
+        DCNv3 Module
+        :param channels     
+        :param kernel_size  
+        :param stride      
+        :param pad     
+        :param dilation
+        :param group
+        :param offset_scale
+        :param act_layer
+        :param norm_layer
+        """
+        super().__init__()
+        if channels % group != 0:
+            raise ValueError(
+                f'channels must be divisible by group, but got {channels} and {group}')
+        _d_per_group = channels // group
+        # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_group):
+            warnings.warn(
+                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
+                "which is more efficient in our CUDA implementation.")
+        self.offset_scale = offset_scale
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = 1
+        self.pad = pad
+        self.group = group
+        self.group_channels = channels // group
+        self.offset_scale = offset_scale
+        self.dw_conv = nn.Sequential(
+            nn.Conv2d(
+                channels,
+                channels,
+                kernel_size=kernel_size,
+                stride=1,
+                padding=(kernel_size-1)//2,
+                groups=channels),
+            build_norm_layer(
+                channels,
+                norm_layer,
+                'channels_first',
+                'channels_last'),
+            build_act_layer(act_layer))
+        self.offset = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size * 2)
+        self.mask = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size)
+        self.input_proj = nn.Linear(channels, channels)
+        self.output_proj = nn.Linear(channels, channels)
+        self._reset_parameters()
+    def _reset_parameters(self):
+        constant_(self.offset.weight.data, 0.)
+        constant_(self.offset.bias.data, 0.)
+        constant_(self.mask.weight.data, 0.)
+        constant_(self.mask.bias.data, 0.)
+        xavier_uniform_(self.input_proj.weight.data)
+        constant_(self.input_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+    def forward(self, input):
+        """
+        :param query                       (N, H, W, C)
+        :return output                     (N, H, W, C)
+        """
+        N, H, W, _ = input.shape
+        x = self.input_proj(input)
+        x1 = input.permute(0, 3, 1, 2)
+        x1 = self.dw_conv(x1)
+        offset = self.offset(x1)
+        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
+        mask = F.softmax(mask, -1).reshape(N, H, W, -1)
+        x = dcnv3_core_pytorch(
+            x, offset, mask,
+            self.kernel_size, self.kernel_size,
+            self.stride, self.stride,
+            self.pad, self.pad,
+            self.dilation, self.dilation,
+            self.group, self.group_channels,
+            self.offset_scale)
+        x = self.output_proj(x)
+        return x
+class DCNv3(nn.Module):
+    def __init__(
+            self, channels=64, kernel_size=3, stride=1,
+            pad=1, dilation=1, group=4, offset_scale=1.0,
+            act_layer='GELU', norm_layer='LN'):
+        """
+        DCNv3 Module
+        :param channels     
+        :param kernel_size  
+        :param stride      
+        :param pad     
+        :param dilation
+        :param group
+        :param offset_scale
+        :param act_layer
+        :param norm_layer
+        """
+        super().__init__()
+        if channels % group != 0:
+            raise ValueError(
+                f'channels must be divisible by group, but got {channels} and {group}')
+        _d_per_group = channels // group
+        # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_group):
+            warnings.warn(
+                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
+                "which is more efficient in our CUDA implementation.")
+        self.offset_scale = offset_scale
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = 1
+        self.pad = pad
+        self.group = group
+        self.group_channels = channels // group
+        self.offset_scale = offset_scale
+        self.dw_conv = nn.Sequential(
+            nn.Conv2d(
+                channels,
+                channels,
+                kernel_size=kernel_size,
+                stride=1,
+                padding=(kernel_size-1)//2,
+                groups=channels),
+            build_norm_layer(
+                channels,
+                norm_layer,
+                'channels_first',
+                'channels_last'),
+            build_act_layer(act_layer))
+        self.offset = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size * 2)
+        self.mask = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size)
+        self.input_proj = nn.Linear(channels, channels)
+        self.output_proj = nn.Linear(channels, channels)
+        self._reset_parameters()
+    def _reset_parameters(self):
+        constant_(self.offset.weight.data, 0.)
+        constant_(self.offset.bias.data, 0.)
+        constant_(self.mask.weight.data, 0.)
+        constant_(self.mask.bias.data, 0.)
+        xavier_uniform_(self.input_proj.weight.data)
+        constant_(self.input_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+    def forward(self, input):
+        """
+        :param query                       (N, H, W, C)
+        :return output                     (N, H, W, C)
+        """
+        N, H, W, _ = input.shape
+        x = self.input_proj(input)
+        dtype = x.dtype
+        x1 = input.permute(0, 3, 1, 2)
+        x1 = self.dw_conv(x1)
+        offset = self.offset(x1)
+        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
+        mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype)
+        x = DCNv3Function.apply(
+            x, offset, mask,
+            self.kernel_size, self.kernel_size,
+            self.stride, self.stride,
+            self.pad, self.pad,
+            self.dilation, self.dilation,
+            self.group, self.group_channels,
+            self.offset_scale,
+            256)
+        x = self.output_proj(x)
+        return x
--- a/classification/ops_dcnv3/setup.py
+++ b/classification/ops_dcnv3/setup.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import os
+import glob
+import torch
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+from setuptools import find_packages
+from setuptools import setup
+requirements = ["torch", "torchvision"]
+def get_extensions():
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "src")
+    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
+    sources = main_file + source_cpu
+    extension = CppExtension
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+    if torch.cuda.is_available() and CUDA_HOME is not None:
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            # "-DCUDA_HAS_FP16=1",
+            # "-D__CUDA_NO_HALF_OPERATORS__",
+            # "-D__CUDA_NO_HALF_CONVERSIONS__",
+            # "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+    else:
+        raise NotImplementedError('Cuda is not availabel')
+    sources = [os.path.join(extensions_dir, s) for s in sources]
+    include_dirs = [extensions_dir]
+    ext_modules = [
+        extension(
+            "DCNv3",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+    return ext_modules
+setup(
+    name="DCNv3",
+    version="1.0",
+    author="InternImage",
+    url="https://github.com/OpenGVLab/InternImage",
+    description=
+    "PyTorch Wrapper for CUDA Functions of DCNv3",
+    packages=find_packages(exclude=(
+        "configs",
+        "tests",
+    )),
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
--- a/classification/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
+++ b/classification/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include <vector>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
+                             const at::Tensor &mask, const int kernel_h,
+                             const int kernel_w, const int stride_h,
+                             const int stride_w, const int pad_h,
+                             const int pad_w, const int dilation_h,
+                             const int dilation_w, const int group,
+                             const int group_channels, const float offset_scale,
+                             const int im2col_step) {
+    AT_ERROR("Not implement on cpu");
+}
+std::vector<at::Tensor>
+dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
+                   const at::Tensor &mask, const int kernel_h,
+                   const int kernel_w, const int stride_h, const int stride_w,
+                   const int pad_h, const int pad_w, const int dilation_h,
+                   const int dilation_w, const int group,
+                   const int group_channels, const float offset_scale,
+                   const at::Tensor &grad_output, const int im2col_step) {
+    AT_ERROR("Not implement on cpu");
+}
--- a/classification/ops_dcnv3/src/cpu/dcnv3_cpu.h
+++ b/classification/ops_dcnv3/src/cpu/dcnv3_cpu.h
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include <torch/extension.h>
+at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
+                             const at::Tensor &mask, const int kernel_h,
+                             const int kernel_w, const int stride_h,
+                             const int stride_w, const int pad_h,
+                             const int pad_w, const int dilation_h,
+                             const int dilation_w, const int group,
+                             const int group_channels, const float offset_scale,
+                             const int im2col_step);
+std::vector<at::Tensor>
+dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
+                   const at::Tensor &mask, const int kernel_h,
+                   const int kernel_w, const int stride_h, const int stride_w,
+                   const int pad_h, const int pad_w, const int dilation_h,
+                   const int dilation_w, const int group,
+                   const int group_channels, const float offset_scale,
+                   const at::Tensor &grad_output, const int im2col_step);
--- a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+++ b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include "cuda/dcnv3_im2col_cuda.cuh"
+#include <vector>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <torch/torch.h>
+at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
+                              const at::Tensor &mask, const int kernel_h,
+                              const int kernel_w, const int stride_h,
+                              const int stride_w, const int pad_h,
+                              const int pad_w, const int dilation_h,
+                              const int dilation_w, const int group,
+                              const int group_channels,
+                              const float offset_scale, const int im2col_step) {
+    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
+    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
+    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
+    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
+    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
+    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
+    const int batch = input.size(0);
+    const int height_in = input.size(1);
+    const int width_in = input.size(2);
+    const int channels = input.size(3);
+    const int height_out =
+        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
+        1;
+    const int width_out =
+        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
+        1;
+    const int im2col_step_ = std::min(batch, im2col_step);
+    AT_ASSERTM(batch % im2col_step_ == 0,
+               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+    AT_ASSERTM(
+        channels == (group * group_channels),
+        "Input channels and group times group channels wont match: (%d vs %d).",
+        channels, group * group_channels);
+    auto output =
+        at::zeros({batch, height_out, width_out, group * group_channels},
+                  input.options());
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch / batch_n, batch_n, height_out,
+                                 width_out, group * group_channels});
+    auto per_input_size = height_in * width_in * group * group_channels;
+    auto per_offset_size =
+        height_out * width_out * group * kernel_h * kernel_w * 2;
+    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
+    for (int n = 0; n < batch / im2col_step_; ++n) {
+        auto columns = output_n.select(0, n);
+        // AT_DISPATCH_FLOATING_TYPES(
+        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+            input.type(), "ms_deform_attn_forward_cuda", ([&] {
+                dcnv3_im2col_cuda(
+                    at::cuda::getCurrentCUDAStream(),
+                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
+                    offset.data<scalar_t>() +
+                        n * im2col_step_ * per_offset_size,
+                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
+                    columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
+                    stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+                    group_channels, batch_n, height_in, width_in, height_out,
+                    width_out, offset_scale);
+            }));
+    }
+    return output;
+}
+std::vector<at::Tensor>
+dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
+                    const at::Tensor &mask, const int kernel_h,
+                    const int kernel_w, const int stride_h, const int stride_w,
+                    const int pad_h, const int pad_w, const int dilation_h,
+                    const int dilation_w, const int group,
+                    const int group_channels, const float offset_scale,
+                    const at::Tensor &grad_output, const int im2col_step) {
+    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
+    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
+    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(),
+               "grad_output tensor has to be contiguous");
+    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
+    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
+    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(),
+               "grad_output must be a CUDA tensor");
+    const int batch = input.size(0);
+    const int height_in = input.size(1);
+    const int width_in = input.size(2);
+    const int channels = input.size(3);
+    const int height_out =
+        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
+        1;
+    const int width_out =
+        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
+        1;
+    const int im2col_step_ = std::min(batch, im2col_step);
+    AT_ASSERTM(batch % im2col_step_ == 0,
+               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+    AT_ASSERTM(
+        channels == (group * group_channels),
+        "Input channels and group times group channels wont match: (%d vs %d).",
+        channels, group * group_channels);
+    auto dtype = input.dtype();
+    if (dtype == at::kHalf) {
+        dtype = at::kFloat;
+    }
+    auto grad_input = at::zeros_like(input, dtype);
+    auto grad_offset = at::zeros_like(offset, dtype);
+    auto grad_mask = at::zeros_like(mask, dtype);
+    const int batch_n = im2col_step_;
+    auto per_input_size = height_in * width_in * group * group_channels;
+    auto per_offset_size =
+        height_out * width_out * group * kernel_h * kernel_w * 2;
+    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
+    auto grad_output_n =
+        grad_output.view({batch / im2col_step_, batch_n, height_out * width_out,
+                          group, group_channels});
+    for (int n = 0; n < batch / im2col_step_; ++n) {
+        auto grad_output_g = grad_output_n.select(0, n);
+        // AT_DISPATCH_FLOATING_TYPES(
+        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+            input.type(), "ms_deform_attn_backward_cuda", ([&] {
+                dcnv3_col2im_cuda(
+                    at::cuda::getCurrentCUDAStream(),
+                    grad_output_g.data<scalar_t>(),
+                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
+                    offset.data<scalar_t>() +
+                        n * im2col_step_ * per_offset_size,
+                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
+                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                    dilation_h, dilation_w, group, group_channels, batch_n,
+                    height_in, width_in, height_out, width_out, offset_scale,
+                    grad_input.data<opmath_t>() +
+                        n * im2col_step_ * per_input_size,
+                    grad_offset.data<opmath_t>() +
+                        n * im2col_step_ * per_offset_size,
+                    grad_mask.data<opmath_t>() +
+                        n * im2col_step_ * per_mask_size);
+            }));
+    }
+    if (input.dtype() == torch::kHalf) {
+        return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
+                grad_mask.to(torch::kHalf)};
+    } else {
+        return {grad_input, grad_offset, grad_mask};
+    }
+}
\ No newline at end of file
--- a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.h
+++ b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.h
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include <torch/extension.h>
+at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
+                              const at::Tensor &mask, const int kernel_h,
+                              const int kernel_w, const int stride_h,
+                              const int stride_w, const int pad_h,
+                              const int pad_w, const int dilation_h,
+                              const int dilation_w, const int group,
+                              const int group_channels,
+                              const float offset_scale, const int im2col_step);
+std::vector<at::Tensor>
+dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
+                    const at::Tensor &mask, const int kernel_h,
+                    const int kernel_w, const int stride_h, const int stride_w,
+                    const int pad_h, const int pad_w, const int dilation_h,
+                    const int dilation_w, const int group,
+                    const int group_channels, const float offset_scale,
+                    const at::Tensor &grad_output, const int im2col_step);
--- a/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+++ b/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <ATen/ATen.h>
+#include <ATen/OpMathType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+#define CUDA_KERNEL_LOOP(i, n)                                                 \
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);               \
+         i += blockDim.x * gridDim.x)
+const int CUDA_NUM_THREADS = 256;
+inline int GET_BLOCKS(const int N, const int num_threads) {
+    return (N + num_threads - 1) / num_threads;
+}
+#define opmath_t at::opmath_type<scalar_t>
+template <typename scalar_t>
+__device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
+                                          const int &height, const int &width,
+                                          const int &group,
+                                          const int &group_channels,
+                                          const opmath_t &h, const opmath_t &w,
+                                          const int &g, const int &c) {
+    const int h_low = floor(h);
+    const int w_low = floor(w);
+    const int h_high = h_low + 1;
+    const int w_high = w_low + 1;
+    const opmath_t lh = h - h_low;
+    const opmath_t lw = w - w_low;
+    const opmath_t hh = 1 - lh, hw = 1 - lw;
+    const int w_stride = group * group_channels;
+    const int h_stride = width * w_stride;
+    const int h_low_ptr_offset = h_low * h_stride;
+    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+    const int w_low_ptr_offset = w_low * w_stride;
+    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+    const int base_ptr = g * group_channels + c;
+    opmath_t v1 = 0;
+    if (h_low >= 0 && w_low >= 0) {
+        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+        v1 = bottom_data[ptr1];
+    }
+    opmath_t v2 = 0;
+    if (h_low >= 0 && w_high <= width - 1) {
+        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+        v2 = bottom_data[ptr2];
+    }
+    opmath_t v3 = 0;
+    if (h_high <= height - 1 && w_low >= 0) {
+        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+        v3 = bottom_data[ptr3];
+    }
+    opmath_t v4 = 0;
+    if (h_high <= height - 1 && w_high <= width - 1) {
+        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+        v4 = bottom_data[ptr4];
+    }
+    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+    return val;
+}
+template <typename scalar_t>
+__device__ void dcnv3_col2im_bilinear(
+    const scalar_t *&bottom_data, const int &height, const int &width,
+    const int &nheads, const int &group_channels, const opmath_t &h,
+    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
+    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
+    opmath_t *grad_offset, opmath_t *grad_mask) {
+    const int h_low = floor(h);
+    const int w_low = floor(w);
+    const int h_high = h_low + 1;
+    const int w_high = w_low + 1;
+    const opmath_t lh = h - h_low;
+    const opmath_t lw = w - w_low;
+    const opmath_t hh = 1 - lh, hw = 1 - lw;
+    const int w_stride = nheads * group_channels;
+    const int h_stride = width * w_stride;
+    const int h_low_ptr_offset = h_low * h_stride;
+    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+    const int w_low_ptr_offset = w_low * w_stride;
+    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+    const int base_ptr = m * group_channels + c;
+    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+    const opmath_t top_grad_im = top_grad * mask;
+    opmath_t grad_h_weight = 0, grad_w_weight = 0;
+    opmath_t v1 = 0;
+    if (h_low >= 0 && w_low >= 0) {
+        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+        v1 = bottom_data[ptr1];
+        grad_h_weight -= hw * v1;
+        grad_w_weight -= hh * v1;
+        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
+    }
+    opmath_t v2 = 0;
+    if (h_low >= 0 && w_high <= width - 1) {
+        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+        v2 = bottom_data[ptr2];
+        grad_h_weight -= lw * v2;
+        grad_w_weight += hh * v2;
+        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
+    }
+    opmath_t v3 = 0;
+    if (h_high <= height - 1 && w_low >= 0) {
+        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+        v3 = bottom_data[ptr3];
+        grad_h_weight += hw * v3;
+        grad_w_weight -= lh * v3;
+        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
+    }
+    opmath_t v4 = 0;
+    if (h_high <= height - 1 && w_high <= width - 1) {
+        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+        v4 = bottom_data[ptr4];
+        grad_h_weight += lw * v4;
+        grad_w_weight += lh * v4;
+        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
+    }
+    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+    *grad_mask = top_grad * val;
+    *grad_offset = offset_scale * grad_w_weight * top_grad_im;
+    *(grad_offset + 1) = offset_scale * grad_h_weight * top_grad_im;
+}
+template <typename scalar_t>
+__device__ void dcnv3_col2im_bilinear_gm(
+    const scalar_t *&bottom_data, const int &height, const int &width,
+    const int &nheads, const int &group_channels, const opmath_t &h,
+    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
+    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
+    opmath_t *grad_offset, opmath_t *grad_mask) {
+    const int h_low = floor(h);
+    const int w_low = floor(w);
+    const int h_high = h_low + 1;
+    const int w_high = w_low + 1;
+    const opmath_t lh = h - h_low;
+    const opmath_t lw = w - w_low;
+    const opmath_t hh = 1 - lh, hw = 1 - lw;
+    const int w_stride = nheads * group_channels;
+    const int h_stride = width * w_stride;
+    const int h_low_ptr_offset = h_low * h_stride;
+    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+    const int w_low_ptr_offset = w_low * w_stride;
+    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+    const int base_ptr = m * group_channels + c;
+    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+    const opmath_t top_grad_im = top_grad * mask;
+    opmath_t grad_h_weight = 0, grad_w_weight = 0;
+    opmath_t v1 = 0;
+    if (h_low >= 0 && w_low >= 0) {
+        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+        v1 = bottom_data[ptr1];
+        grad_h_weight -= hw * v1;
+        grad_w_weight -= hh * v1;
+        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
+    }
+    opmath_t v2 = 0;
+    if (h_low >= 0 && w_high <= width - 1) {
+        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+        v2 = bottom_data[ptr2];
+        grad_h_weight -= lw * v2;
+        grad_w_weight += hh * v2;
+        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
+    }
+    opmath_t v3 = 0;
+    if (h_high <= height - 1 && w_low >= 0) {
+        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+        v3 = bottom_data[ptr3];
+        grad_h_weight += hw * v3;
+        grad_w_weight -= lh * v3;
+        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
+    }
+    opmath_t v4 = 0;
+    if (h_high <= height - 1 && w_high <= width - 1) {
+        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+        v4 = bottom_data[ptr4];
+        grad_h_weight += lw * v4;
+        grad_w_weight += lh * v4;
+        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
+    }
+    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+    atomicAdd(grad_mask, top_grad * val);
+    atomicAdd(grad_offset, offset_scale * grad_w_weight * top_grad_im);
+    atomicAdd(grad_offset + 1, offset_scale * grad_h_weight * top_grad_im);
+}
+template <typename scalar_t>
+__global__ void dcnv3_im2col_gpu_kernel(
+    const int num_kernels, const scalar_t *data_im, const scalar_t *data_offset,
+    const scalar_t *data_mask, scalar_t *data_col, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+        const int input_size = height_in * width_in;
+        scalar_t *data_col_ptr = data_col + index;
+        const int kernel_size = kernel_h * kernel_w;
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int qid_stride = group * group_channels;
+        opmath_t col = 0;
+        const scalar_t *data_im_ptr = data_im + b_col * input_size * qid_stride;
+        // top-left
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    col += dcnv3_im2col_bilinear(
+                               data_im_ptr, height_in, width_in, group,
+                               group_channels, loc_h, loc_w, g_col, c_col) *
+                           weight;
+                }
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+            }
+        }
+        *data_col_ptr = col;
+    }
+}
+// debug
+template <typename scalar_t, unsigned int blockSize>
+__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        __shared__ opmath_t cache_grad_offset[blockSize * 2];
+        __shared__ opmath_t cache_grad_mask[blockSize];
+        unsigned int tid = threadIdx.x;
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+                __syncthreads();
+                if (tid == 0) {
+                    opmath_t _grad_w = cache_grad_offset[0],
+                             _grad_h = cache_grad_offset[1],
+                             _grad_a = cache_grad_mask[0];
+                    int sid = 2;
+                    for (unsigned int tid = 1; tid < blockSize; ++tid) {
+                        _grad_w += cache_grad_offset[sid];
+                        _grad_h += cache_grad_offset[sid + 1];
+                        _grad_a += cache_grad_mask[tid];
+                        sid += 2;
+                    }
+                    *grad_offset = _grad_w;
+                    *(grad_offset + 1) = _grad_h;
+                    *grad_mask = _grad_a;
+                }
+                __syncthreads();
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+template <typename scalar_t, unsigned int blockSize>
+__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        __shared__ opmath_t cache_grad_offset[blockSize * 2];
+        __shared__ opmath_t cache_grad_mask[blockSize];
+        unsigned int tid = threadIdx.x;
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+                __syncthreads();
+                for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
+                    if (tid < s) {
+                        const unsigned int xid1 = tid << 1;
+                        const unsigned int xid2 = (tid + s) << 1;
+                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
+                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
+                        cache_grad_offset[xid1 + 1] +=
+                            cache_grad_offset[xid2 + 1];
+                    }
+                    __syncthreads();
+                }
+                if (tid == 0) {
+                    *grad_offset = cache_grad_offset[0];
+                    *(grad_offset + 1) = cache_grad_offset[1];
+                    *grad_mask = cache_grad_mask[0];
+                }
+                __syncthreads();
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v1(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        extern __shared__ int _s[];
+        opmath_t *cache_grad_offset = (opmath_t *)_s;
+        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
+        unsigned int tid = threadIdx.x;
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+                __syncthreads();
+                if (tid == 0) {
+                    opmath_t _grad_w = cache_grad_offset[0],
+                             _grad_h = cache_grad_offset[1],
+                             _grad_a = cache_grad_mask[0];
+                    int sid = 2;
+                    for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
+                        _grad_w += cache_grad_offset[sid];
+                        _grad_h += cache_grad_offset[sid + 1];
+                        _grad_a += cache_grad_mask[tid];
+                        sid += 2;
+                    }
+                    *grad_offset = _grad_w;
+                    *(grad_offset + 1) = _grad_h;
+                    *grad_mask = _grad_a;
+                }
+                __syncthreads();
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        extern __shared__ int _s[];
+        opmath_t *cache_grad_offset = (opmath_t *)_s;
+        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
+        unsigned int tid = threadIdx.x;
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+                __syncthreads();
+                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+                     s >>= 1, spre >>= 1) {
+                    if (tid < s) {
+                        const unsigned int xid1 = tid << 1;
+                        const unsigned int xid2 = (tid + s) << 1;
+                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
+                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
+                        cache_grad_offset[xid1 + 1] +=
+                            cache_grad_offset[xid2 + 1];
+                        if (tid + (s << 1) < spre) {
+                            cache_grad_mask[tid] +=
+                                cache_grad_mask[tid + (s << 1)];
+                            cache_grad_offset[xid1] +=
+                                cache_grad_offset[xid2 + (s << 1)];
+                            cache_grad_offset[xid1 + 1] +=
+                                cache_grad_offset[xid2 + 1 + (s << 1)];
+                        }
+                    }
+                    __syncthreads();
+                }
+                if (tid == 0) {
+                    *grad_offset = cache_grad_offset[0];
+                    *(grad_offset + 1) = cache_grad_offset[1];
+                    *grad_mask = cache_grad_mask[0];
+                }
+                __syncthreads();
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        extern __shared__ int _s[];
+        opmath_t *cache_grad_offset = (opmath_t *)_s;
+        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
+        unsigned int tid = threadIdx.x;
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+                __syncthreads();
+                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+                     s >>= 1, spre >>= 1) {
+                    if (tid < s) {
+                        const unsigned int xid1 = tid << 1;
+                        const unsigned int xid2 = (tid + s) << 1;
+                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
+                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
+                        cache_grad_offset[xid1 + 1] +=
+                            cache_grad_offset[xid2 + 1];
+                        if (tid + (s << 1) < spre) {
+                            cache_grad_mask[tid] +=
+                                cache_grad_mask[tid + (s << 1)];
+                            cache_grad_offset[xid1] +=
+                                cache_grad_offset[xid2 + (s << 1)];
+                            cache_grad_offset[xid1 + 1] +=
+                                cache_grad_offset[xid2 + 1 + (s << 1)];
+                        }
+                    }
+                    __syncthreads();
+                }
+                if (tid == 0) {
+                    atomicAdd(grad_offset, cache_grad_offset[0]);
+                    atomicAdd(grad_offset + 1, cache_grad_offset[1]);
+                    atomicAdd(grad_mask, cache_grad_mask[0]);
+                }
+                __syncthreads();
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_gm(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear_gm(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr, grad_offset, grad_mask);
+                }
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+template <typename scalar_t>
+void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im,
+                       const scalar_t *data_offset, const scalar_t *data_mask,
+                       scalar_t *data_col, const int kernel_h,
+                       const int kernel_w, const int stride_h,
+                       const int stride_w, const int pad_h, const int pad_w,
+                       const int dilation_h, const int dilation_w,
+                       const int group, const int group_channels,
+                       const int batch_n, const int height_in,
+                       const int width_in, const int height_out,
+                       const int width_out, const opmath_t offset_scale) {
+    const int num_kernels =
+        batch_n * height_out * width_out * group * group_channels;
+    const int num_actual_kernels =
+        batch_n * height_out * width_out * group * group_channels;
+    const int num_threads = CUDA_NUM_THREADS;
+    dcnv3_im2col_gpu_kernel<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+           stream>>>(num_kernels, data_im, data_offset, data_mask, data_col,
+                     kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                     dilation_h, dilation_w, group, group_channels, height_in,
+                     width_in, height_out, width_out, offset_scale);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("error in dcnv3_im2col_cuda: %s\n", cudaGetErrorString(err));
+    }
+}
+template <typename scalar_t>
+void dcnv3_col2im_cuda(
+    cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int batch_n,
+    const int height_in, const int width_in, const int height_out,
+    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
+    opmath_t *grad_offset, opmath_t *grad_mask) {
+    const int num_threads =
+        (group_channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : group_channels;
+    const int num_kernels =
+        batch_n * height_out * width_out * group * group_channels;
+    const int num_actual_kernels =
+        batch_n * height_out * width_out * group * group_channels;
+    if (group_channels > 1024) {
+        if ((group_channels & 1023) == 0) {
+            dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+                   num_threads * 3 * sizeof(opmath_t), stream>>>(
+                    num_kernels, grad_col, data_im, data_offset, data_mask,
+                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                    dilation_h, dilation_w, group, group_channels, height_in,
+                    width_in, height_out, width_out, offset_scale, grad_im,
+                    grad_offset, grad_mask);
+        } else {
+            dcnv3_col2im_gpu_kernel_gm<scalar_t>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+        }
+    } else {
+        switch (group_channels) {
+        case 1:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 2:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 4:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 8:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 16:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 32:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 64:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 128:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 256:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 512:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 1024:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
+                                                                  1024>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        default:
+            if (group_channels < 64) {
+                dcnv3_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+                    <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+                       num_threads * 3 * sizeof(opmath_t), stream>>>(
+                        num_kernels, grad_col, data_im, data_offset, data_mask,
+                        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                        dilation_h, dilation_w, group, group_channels,
+                        height_in, width_in, height_out, width_out,
+                        offset_scale, grad_im, grad_offset, grad_mask);
+            } else {
+                dcnv3_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+                    <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+                       num_threads * 3 * sizeof(opmath_t), stream>>>(
+                        num_kernels, grad_col, data_im, data_offset, data_mask,
+                        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                        dilation_h, dilation_w, group, group_channels,
+                        height_in, width_in, height_out, width_out,
+                        offset_scale, grad_im, grad_offset, grad_mask);
+            }
+        }
+    }
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
+    }
+}
\ No newline at end of file
--- a/classification/ops_dcnv3/src/dcnv3.h
+++ b/classification/ops_dcnv3/src/dcnv3.h
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include "cpu/dcnv3_cpu.h"
+#ifdef WITH_CUDA
+#include "cuda/dcnv3_cuda.h"
+#endif
+at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
+                         const at::Tensor &mask, const int kernel_h,
+                         const int kernel_w, const int stride_h,
+                         const int stride_w, const int pad_h, const int pad_w,
+                         const int dilation_h, const int dilation_w,
+                         const int group, const int group_channels,
+                         const float offset_scale, const int im2col_step) {
+    if (input.type().is_cuda()) {
+#ifdef WITH_CUDA
+        return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
+                                  stride_h, stride_w, pad_h, pad_w, dilation_h,
+                                  dilation_w, group, group_channels,
+                                  offset_scale, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+std::vector<at::Tensor>
+dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
+               const at::Tensor &mask, const int kernel_h, const int kernel_w,
+               const int stride_h, const int stride_w, const int pad_h,
+               const int pad_w, const int dilation_h, const int dilation_w,
+               const int group, const int group_channels,
+               const float offset_scale, const at::Tensor &grad_output,
+               const int im2col_step) {
+    if (input.type().is_cuda()) {
+#ifdef WITH_CUDA
+        return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
+                                   stride_h, stride_w, pad_h, pad_w, dilation_h,
+                                   dilation_w, group, group_channels,
+                                   offset_scale, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
--- a/classification/ops_dcnv3/src/vision.cpp
+++ b/classification/ops_dcnv3/src/vision.cpp
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include "dcnv3.h"
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward");
+    m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward");
+}
--- a/classification/ops_dcnv3/test.py
+++ b/classification/ops_dcnv3/test.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import time
+import torch
+import torch.nn as nn
+import math
+from torch.autograd import gradcheck
+from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch
+H_in, W_in = 8, 8
+N, M, D = 2, 4, 16
+Kh, Kw = 3, 3
+P = Kh * Kw
+offset_scale = 2.0
+pad = 1
+dilation = 1
+stride = 1
+H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+torch.manual_seed(3)
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+    output_pytorch = dcnv3_core_pytorch(
+        input.double(),
+        offset.double(),
+        mask.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu()
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input.double(),
+        offset.double(),
+        mask.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() /
+                   output_pytorch.abs()).max()
+    print('>>> forward double')
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+    output_pytorch = dcnv3_core_pytorch(
+        input,
+        offset,
+        mask,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu()
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input,
+        offset,
+        mask,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() /
+                   output_pytorch.abs()).max()
+    print('>>> forward float')
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
+    # H_in, W_in = 4, 4
+    N = 2
+    M = 2
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+    D = channels
+    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask0 /= mask0.sum(-1, keepdim=True)
+    mask0 = mask0.reshape(N, H_out, W_out, M*P)
+    input0.requires_grad = grad_input
+    offset0.requires_grad = grad_offset
+    mask0.requires_grad = grad_mask
+    output_pytorch = dcnv3_core_pytorch(
+        input0.double(),
+        offset0.double(),
+        mask0.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale)
+    output_pytorch.sum().backward()
+    input1 = input0.detach()
+    offset1 = offset0.detach()
+    mask1 = mask0.detach()
+    input1.requires_grad = grad_input
+    offset1.requires_grad = grad_offset
+    mask1.requires_grad = grad_mask
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input1.double(),
+        offset1.double(),
+        mask1.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step)
+    output_cuda.sum().backward()
+    print(f'>>> backward double: channels {D}')
+    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (input0.grad - input1.grad).abs().max()
+    max_rel_err = ((input0.grad - input1.grad).abs() /
+                   input0.grad.abs()).max()
+    print(
+        f'* {bwdok} input_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (offset0.grad - offset1.grad).abs().max()
+    max_rel_err = ((offset0.grad - offset1.grad).abs() /
+                   offset0.grad.abs()).max()
+    print(
+        f'* {bwdok} offset_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (mask0.grad - mask1.grad).abs().max()
+    max_rel_err = ((mask0.grad - mask1.grad).abs() /
+                   mask0.grad.abs()).max()
+    print(
+        f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+def check_backward_equal_with_pytorch_float(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
+    # H_in, W_in = 4, 4
+    N = 2
+    M = 2
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+    D = channels
+    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask0 /= mask0.sum(-1, keepdim=True)
+    mask0 = mask0.reshape(N, H_out, W_out, M*P)
+    input0.requires_grad = grad_input
+    offset0.requires_grad = grad_offset
+    mask0.requires_grad = grad_mask
+    output_pytorch = dcnv3_core_pytorch(
+        input0,
+        offset0,
+        mask0,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale)
+    output_pytorch.sum().backward()
+    input1 = input0.detach()
+    offset1 = offset0.detach()
+    mask1 = mask0.detach()
+    input1.requires_grad = grad_input
+    offset1.requires_grad = grad_offset
+    mask1.requires_grad = grad_mask
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input1,
+        offset1,
+        mask1,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step)
+    output_cuda.sum().backward()
+    print(f'>>> backward float: channels {D}')
+    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (input0.grad - input1.grad).abs().max()
+    max_rel_err = ((input0.grad - input1.grad).abs() /
+                   input0.grad.abs()).max()
+    print(
+        f'* {bwdok} input_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (offset0.grad - offset1.grad).abs().max()
+    max_rel_err = ((offset0.grad - offset1.grad).abs() /
+                   offset0.grad.abs()).max()
+    print(
+        f'* {bwdok} offset_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (mask0.grad - mask1.grad).abs().max()
+    max_rel_err = ((mask0.grad - mask1.grad).abs() /
+                   mask0.grad.abs()).max()
+    print(
+        f'* {bwdok} mask_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+@torch.no_grad()
+def check_time_cost(im2col_step=128):
+    N = 512
+    H_in, W_in = 64, 64
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+    print(
+        f'>>> time cost: im2col_step {im2col_step}; input {input.shape}; points {P} ')
+    repeat = 100
+    for i in range(repeat):
+        output_cuda = DCNv3Function.apply(
+            input,
+            offset,
+            mask,
+            Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0,
+            im2col_step)
+    torch.cuda.synchronize()
+    start = time.time()
+    for i in range(repeat):
+        output_cuda = DCNv3Function.apply(
+            input,
+            offset,
+            mask,
+            Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0,
+            im2col_step)
+    torch.cuda.synchronize()
+    print(f'foward time cost: {(time.time() - start) / repeat}')
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+    for channels in [1, 16, 30, 32, 64, 71, 1025]:
+        check_backward_equal_with_pytorch_double(channels, True, True, True)
+    for channels in [1, 16, 30, 32, 64, 71, 1025]:
+        check_backward_equal_with_pytorch_float(channels, True, True, True)
+    for i in range(3):
+        im2col_step = 128 * (2 ** i)
+        check_time_cost(im2col_step)
--- a/classification/optimizer.py
+++ b/classification/optimizer.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from torch import optim as optim
+from torch.distributed.optim import ZeroRedundancyOptimizer
+def build_optimizer(config, model):
+    """
+    Build optimizer, set weight decay of normalization to 0 by default.
+    """
+    skip = {}
+    skip_keywords = {}
+    if hasattr(model, 'no_weight_decay'):
+        skip = model.no_weight_decay()
+    if hasattr(model, 'no_weight_decay_keywords'):
+        skip_keywords = model.no_weight_decay_keywords()
+    parameters = set_weight_decay_and_lr(
+        model,
+        config.TRAIN.WEIGHT_DECAY,
+        config.TRAIN.BASE_LR,
+        skip,
+        skip_keywords,
+        lr_layer_decay=config.TRAIN.LR_LAYER_DECAY,
+        lr_layer_decay_ratio=config.TRAIN.LR_LAYER_DECAY_RATIO,
+        freeze_backbone=config.TRAIN.OPTIMIZER.FREEZE_BACKBONE,
+        dcn_lr_mul=config.TRAIN.OPTIMIZER.DCN_LR_MUL,
+    )
+    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
+    optimizer = None
+    use_zero = config.TRAIN.OPTIMIZER.USE_ZERO
+    if use_zero:
+        print(f"\nUse Zero!")
+        if opt_lower == 'sgd':
+            # an ugly implementation
+            # https://github.com/pytorch/pytorch/issues/71347
+            optimizer = ZeroRedundancyOptimizer(
+                parameters[0]['params'],
+                optimizer_class=optim.SGD,
+                momentum=config.TRAIN.OPTIMIZER.MOMENTUM,
+                nesterov=True,
+                lr=config.TRAIN.BASE_LR,
+                weight_decay=config.TRAIN.WEIGHT_DECAY)
+            if len(parameters[1]['params']) > 0:
+                optimizer.add_param_group({
+                    "params": parameters[1]['params'],
+                    'weight_decay': 0.
+                })
+        elif opt_lower == 'adamw':
+            optimizer = ZeroRedundancyOptimizer(
+                parameters[0]['params'],
+                optimizer_class=optim.AdamW,
+                eps=config.TRAIN.OPTIMIZER.EPS,
+                betas=config.TRAIN.OPTIMIZER.BETAS,
+                lr=config.TRAIN.BASE_LR,
+                weight_decay=config.TRAIN.WEIGHT_DECAY)
+            if len(parameters[1]['params']) > 0:
+                optimizer.add_param_group({
+                    "params": parameters[1]['params'],
+                    'weight_decay': 0.
+                })
+    else:
+        if opt_lower == 'sgd':
+            optimizer = optim.SGD(parameters,
+                                  momentum=config.TRAIN.OPTIMIZER.MOMENTUM,
+                                  nesterov=True,
+                                  lr=config.TRAIN.BASE_LR,
+                                  weight_decay=config.TRAIN.WEIGHT_DECAY)
+        elif opt_lower == 'adamw':
+            optimizer = optim.AdamW(parameters,
+                                    eps=config.TRAIN.OPTIMIZER.EPS,
+                                    betas=config.TRAIN.OPTIMIZER.BETAS,
+                                    lr=config.TRAIN.BASE_LR,
+                                    weight_decay=config.TRAIN.WEIGHT_DECAY)
+    return optimizer
+def check_keywords_in_name(name, keywords=()):
+    isin = False
+    for keyword in keywords:
+        if keyword in name:
+            isin = True
+    return isin
+def check_keywords_in_dict(name, keywords_dict):
+    for k, v in keywords_dict.items():
+        if k in name:
+            return v
+    return None
+def set_weight_decay_and_lr(
+    model,
+    weight_decay,
+    base_lr,
+    skip_list=(),
+    skip_keywords=(),
+    lr_layer_decay=None,
+    lr_layer_decay_ratio=None,
+    freeze_backbone=None,
+    dcn_lr_mul=None,
+    layerwise_lr=True,
+):
+    parameters = []
+    no_decay_name = []
+    lr_ratio_log = {}
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen weights
+        if freeze_backbone:
+            for i in freeze_backbone:
+                if f'levels.{i}' in name:
+                    param.requires_grad = False
+        # 1. check wd
+        if len(param.shape) == 1 or name.endswith(".bias") or (
+                name in skip_list) or check_keywords_in_name(
+                    name, skip_keywords):
+            wd = 0.
+            no_decay_name.append(name)
+        else:
+            wd = weight_decay
+        if lr_layer_decay:
+            print('layer-wise lr decay is used !')
+            assert hasattr(model, 'lr_decay_keywards')
+            lr_ratio_keywards = model.lr_decay_keywards(lr_layer_decay_ratio)
+            # 2. check lr
+            ratio = check_keywords_in_dict(name, lr_ratio_keywards)
+            if ratio is not None:
+                lr = ratio * base_lr
+            else:
+                lr = base_lr
+            # dcn lr
+            if dcn_lr_mul is not None:
+                if 'offset' in name or 'attention_weights' in name or 'center_feature_scale_proj' in name or 'alpha_beta' in name:
+                    lr = dcn_lr_mul * lr
+            lr_ratio_log[name] = (base_lr, ratio, wd, param.requires_grad)
+        else:
+            lr = base_lr
+        parameters.append({'params': [param], 'weight_decay': wd, 'lr': lr})
+    print('no decay params: {no_decay_name}')
+    if layerwise_lr:
+        print('lr_ratio_params:')
+        for k, v in lr_ratio_log.items():
+            print(k, v)
+    return parameters
--- a/classification/train_in1k.sh
+++ b/classification/train_in1k.sh
+#!/usr/bin/env bash
+set -x
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-12}
+SRUN_ARGS=${SRUN_ARGS:-""}
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+    srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    --quotatype=reserved \
+    ${SRUN_ARGS} \
+    python -u main.py \
+    --cfg ${CONFIG} \
+    --accumulation-steps 1 \
+    --local_rank 0 \
+    --data-path /mnt/lustre/share/images \
+    --output work_dirs ${@:4}
--- a/classification/utils.py
+++ b/classification/utils.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import os
+import math
+import torch
+import numpy as np
+import torch.distributed as dist
+from collections import OrderedDict
+from timm.utils import get_state_dict
+try:
+    # noinspection PyUnresolvedReferences
+    from apex import amp
+except ImportError:
+    amp = None
+def load_ema_checkpoint(config, model_ema, logger):
+    logger.info(
+        f'==============> Resuming form {config.MODEL.RESUME}....................'
+    )
+    if config.MODEL.RESUME.startswith('https'):
+        checkpoint = torch.hub.load_state_dict_from_url(config.MODEL.RESUME,
+                                                        map_location='cpu',
+                                                        check_hash=True)
+    else:
+        checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
+    assert isinstance(checkpoint, dict)
+    if 'model_ema' in checkpoint:
+        new_state_dict = OrderedDict()
+        for k, v in checkpoint['model_ema'].items():
+            if model_ema.ema_has_module:
+                name = 'module.' + k if not k.startswith('module') else k
+            else:
+                name = k
+            new_state_dict[name] = v
+        msg = model_ema.ema.load_state_dict(new_state_dict, strict=False)
+        logger.info(msg)
+        logger.info('Loaded state_dict_ema')
+    else:
+        logger.warning(
+            'Failed to find state_dict_ema, starting from loaded model weights'
+        )
+    max_accuracy_ema = 0
+    if 'max_accuracy_ema' in checkpoint:
+        max_accuracy_ema = checkpoint['max_accuracy_ema']
+    if 'ema_decay' in checkpoint:
+        model_ema.decay = checkpoint['ema_decay']
+    return max_accuracy_ema
+def load_checkpoint(config, model, optimizer, lr_scheduler, scaler, logger):
+    logger.info(
+        f'==============> Resuming form {config.MODEL.RESUME}....................'
+    )
+    if config.MODEL.RESUME.startswith('https'):
+        checkpoint = torch.hub.load_state_dict_from_url(config.MODEL.RESUME,
+                                                        map_location='cpu',
+                                                        check_hash=True)
+    else:
+        checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
+    print('resuming model')
+    msg = model.load_state_dict(checkpoint['model'], strict=False)
+    logger.info(msg)
+    max_accuracy = 0.0
+    if not config.EVAL_MODE and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
+        if optimizer is not None:
+            print('resuming optimizer')
+            try:
+                optimizer.load_state_dict(checkpoint['optimizer'])
+            except:
+                print('resume optimizer failed')
+        if lr_scheduler is not None:
+            print('resuming lr_scheduler')
+            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+        config.defrost()
+        config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
+        config.freeze()
+        if 'amp' in checkpoint and config.AMP_OPT_LEVEL != 'O0' and checkpoint[
+                'config'].AMP_OPT_LEVEL != 'O0':
+            scaler.load_state_dict(checkpoint['amp'])
+        logger.info(
+            f"=> loaded successfully {config.MODEL.RESUME} (epoch {checkpoint['epoch']})"
+        )
+        if 'max_accuracy' in checkpoint:
+            max_accuracy = checkpoint['max_accuracy']
+    del checkpoint
+    torch.cuda.empty_cache()
+    return max_accuracy
+def load_pretrained(config, model, logger):
+    logger.info(
+        f'==============> Loading weight {config.MODEL.PRETRAINED} for fine-tuning......'
+    )
+    checkpoint = torch.load(config.MODEL.PRETRAINED, map_location='cpu')
+    state_dict = checkpoint
+    if 'model' in checkpoint:
+        state_dict = checkpoint['model']
+    elif 'module' in checkpoint:
+        state_dict = checkpoint['module']
+    first_key = list(state_dict.keys())[0]
+    # delete teacher weights
+    if 'student' in first_key or 'teacher' in first_key:
+        new_state_dict = OrderedDict()
+        for k, v in state_dict.items():
+            if 'student_proj' in k:
+                continue
+            if 'student' in k:
+                new_k = k.replace('student.', '')
+                new_state_dict[new_k] = v
+        state_dict = new_state_dict
+    # weights from sim
+    if 'mask_token' in first_key:
+        new_state_dict = OrderedDict()
+        for k, v in state_dict.items():
+            if 'mm_dcnv3' in k:
+                continue
+            if 'dcnv3' not in k and 'clip_projector' not in k:
+                continue
+            new_k = k.replace('dcnv3.', '')
+            new_state_dict[new_k] = v
+        new_state_dict['fc_norm.weight'] = state_dict[
+            'clip.classifier_ln.weight']
+        new_state_dict['fc_norm.bias'] = state_dict['clip.classifier_ln.bias']
+        new_state_dict['head.weight'] = state_dict['clip.classifier.weight']
+        new_state_dict['head.bias'] = state_dict['clip.classifier.bias']
+        state_dict = new_state_dict
+    # delete relative_position_index since we always re-init it
+    relative_position_index_keys = [
+        k for k in state_dict.keys() if 'relative_position_index' in k
+    ]
+    for k in relative_position_index_keys:
+        del state_dict[k]
+    # delete relative_coords_table since we always re-init it
+    relative_position_index_keys = [
+        k for k in state_dict.keys() if 'relative_coords_table' in k
+    ]
+    for k in relative_position_index_keys:
+        del state_dict[k]
+    # delete attn_mask since we always re-init it
+    attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k]
+    for k in attn_mask_keys:
+        del state_dict[k]
+    # bicubic interpolate relative_position_bias_table if not match
+    relative_position_bias_table_keys = [
+        k for k in state_dict.keys() if 'relative_position_bias_table' in k
+    ]
+    for k in relative_position_bias_table_keys:
+        relative_position_bias_table_pretrained = state_dict[k]
+        relative_position_bias_table_current = model.state_dict()[k]
+        L1, nH1 = relative_position_bias_table_pretrained.size()
+        L2, nH2 = relative_position_bias_table_current.size()
+        if nH1 != nH2:
+            logger.warning(f'Error in loading {k}, passing......')
+        else:
+            if L1 != L2:
+                # bicubic interpolate relative_position_bias_table if not match
+                S1 = int(L1**0.5)
+                S2 = int(L2**0.5)
+                relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate(
+                    relative_position_bias_table_pretrained.permute(1, 0).view(
+                        1, nH1, S1, S1),
+                    size=(S2, S2),
+                    mode='bicubic')
+                state_dict[
+                    k] = relative_position_bias_table_pretrained_resized.view(
+                        nH2, L2).permute(1, 0)
+    # bicubic interpolate absolute_pos_embed if not match
+    absolute_pos_embed_keys = [
+        k for k in state_dict.keys() if 'absolute_pos_embed' in k
+    ]
+    for k in absolute_pos_embed_keys:
+        # dpe
+        absolute_pos_embed_pretrained = state_dict[k]
+        absolute_pos_embed_current = model.state_dict()[k]
+        _, L1, C1 = absolute_pos_embed_pretrained.size()
+        _, L2, C2 = absolute_pos_embed_current.size()
+        if C1 != C1:
+            logger.warning(f'Error in loading {k}, passing......')
+        else:
+            if L1 != L2:
+                S1 = int(L1**0.5)
+                S2 = int(L2**0.5)
+                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.reshape(
+                    -1, S1, S1, C1)
+                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.permute(
+                    0, 3, 1, 2)
+                absolute_pos_embed_pretrained_resized = torch.nn.functional.interpolate(
+                    absolute_pos_embed_pretrained,
+                    size=(S2, S2),
+                    mode='bicubic')
+                absolute_pos_embed_pretrained_resized = absolute_pos_embed_pretrained_resized.permute(
+                    0, 2, 3, 1)
+                absolute_pos_embed_pretrained_resized = absolute_pos_embed_pretrained_resized.flatten(
+                    1, 2)
+                state_dict[k] = absolute_pos_embed_pretrained_resized
+    # check classifier, if not match, then re-init classifier to zero
+    if 'head.bias' in state_dict:
+        head_bias_pretrained = state_dict['head.bias']
+        Nc1 = head_bias_pretrained.shape[0]
+        Nc2 = model.head.bias.shape[0]
+        if (Nc1 != Nc2):
+            if config.TRAIN.RAND_INIT_FT_HEAD:
+                model.head.weight.data = model.head.weight.data * 0.001
+                model.head.bias.data = model.head.bias.data * 0.001
+                del state_dict['head.weight']
+                del state_dict['head.bias']
+                logger.warning(
+                    f'Error in loading classifier head, re-init classifier head to 0'
+                )
+            elif Nc1 == 21841 and Nc2 == 1000:
+                logger.info(
+                    'loading ImageNet-22K weight to ImageNet-1K ......')
+                map22kto1k_path = 'meta_data/map22kto1k.txt'
+                logger.info(map22kto1k_path)
+                with open(map22kto1k_path) as f:
+                    map22kto1k = f.readlines()
+                map22kto1k = [int(id22k.strip()) for id22k in map22kto1k]
+                state_dict['head.weight'] = state_dict['head.weight'][
+                    map22kto1k, :]
+                state_dict['head.bias'] = state_dict['head.bias'][map22kto1k]
+    msg = model.load_state_dict(state_dict, strict=False)
+    logger.warning(msg)
+    # from IPython import embed
+    # embed()
+    logger.info(f'=> loaded successfully {config.MODEL.PRETRAINED}')
+    del checkpoint
+    torch.cuda.empty_cache()
+def convert_22k_head_to_1k(model, logger):
+    head_weight = model.module.head.weight
+    head_bias = model.module.head.bias
+    Nc1 = head_bias.shape[0]
+    if Nc1 == 21841:
+        logger.info('converting ImageNet-22K head to ImageNet-1K ......')
+        map22kto1k_path = 'meta_data/map22kto1k.txt'
+        logger.info(map22kto1k_path)
+        with open(map22kto1k_path) as f:
+            map22kto1k = f.readlines()
+        map22kto1k = [int(id22k.strip()) for id22k in map22kto1k]
+        model.module.head.weight = torch.nn.Parameter(
+            head_weight[map22kto1k, :])
+        model.module.head.bias = torch.nn.Parameter(head_bias[map22kto1k])
+    else:
+        logger.warning(f'Error in converting classifier head')
+    return model
+def save_checkpoint(config,
+                    epoch,
+                    model,
+                    max_accuracy,
+                    optimizer,
+                    lr_scheduler,
+                    scaler,
+                    logger,
+                    model_ema=None,
+                    max_accuracy_ema=None,
+                    ema_decay=None,
+                    model_ems=None,
+                    max_accuracy_ems=None,
+                    ems_model_num=None,
+                    best=None):
+    save_state = {
+        'model': model.state_dict(),
+        'optimizer': optimizer.state_dict(),
+        'lr_scheduler': lr_scheduler.state_dict(),
+        'max_accuracy': max_accuracy,
+        'epoch': epoch,
+        'config': config
+    }
+    if model_ema is not None:
+        save_state['model_ema'] = get_state_dict(model_ema)
+    if max_accuracy_ema is not None:
+        save_state['max_accuracy_ema'] = max_accuracy_ema
+    if ema_decay is not None:
+        save_state['ema_decay'] = ema_decay
+    if model_ems is not None:
+        save_state['model_ems'] = get_state_dict(model_ems)
+    if max_accuracy_ems is not None:
+        save_state['max_accuracy_ems'] = max_accuracy_ems
+    if ems_model_num is not None:
+        save_state['ems_model_num'] = ems_model_num
+    if config.AMP_OPT_LEVEL != 'O0':
+        # save_state['amp'] = amp.state_dict()
+        save_state['amp'] = scaler.state_dict()
+    if best is None:
+        save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{epoch}.pth')
+    else:
+        save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{best}.pth')
+    logger.info(f'{save_path} saving......')
+    torch.save(save_state, save_path)
+    logger.info(f'{save_path} saved !!!')
+    if dist.get_rank() == 0 and isinstance(epoch, int):
+        to_del = epoch - config.SAVE_CKPT_NUM * config.SAVE_FREQ
+        old_ckpt = os.path.join(config.OUTPUT, f'ckpt_epoch_{to_del}.pth')
+        if os.path.exists(old_ckpt):
+            os.remove(old_ckpt)
+def get_grad_norm(parameters, norm_type=2):
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
+    norm_type = float(norm_type)
+    total_norm = 0
+    for p in parameters:
+        param_norm = p.grad.data.norm(norm_type)
+        total_norm += param_norm.item()**norm_type
+    total_norm = total_norm**(1. / norm_type)
+    return total_norm
+def auto_resume_helper(output_dir):
+    checkpoints = os.listdir(output_dir)
+    checkpoints = [ckpt for ckpt in checkpoints if ckpt.endswith('pth')]
+    print(f'All checkpoints founded in {output_dir}: {checkpoints}')
+    if len(checkpoints) > 0:
+        latest_checkpoint = max(
+            [os.path.join(output_dir, d) for d in checkpoints],
+            key=os.path.getmtime)
+        print(f'The latest checkpoint founded: {latest_checkpoint}')
+        resume_file = latest_checkpoint
+    else:
+        resume_file = None
+    return resume_file
+def reduce_tensor(tensor):
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
+    rt /= dist.get_world_size()
+    return rt
+# https://github.com/facebookresearch/ConvNeXt/blob/main/utils.py
+class NativeScalerWithGradNormCount:
+    state_dict_key = 'amp_scaler'
+    def __init__(self):
+        self._scaler = torch.cuda.amp.GradScaler()
+    def __call__(self,
+                 loss,
+                 optimizer,
+                 clip_grad=None,
+                 parameters=None,
+                 create_graph=False,
+                 update_grad=True):
+        self._scaler.scale(loss).backward(create_graph=create_graph)
+        if update_grad:
+            if clip_grad is not None:
+                assert parameters is not None
+                self._scaler.unscale_(
+                    optimizer
+                )  # unscale the gradients of optimizer's assigned params in-place
+                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
+            else:
+                self._scaler.unscale_(optimizer)
+                norm = get_grad_norm(parameters)
+            self._scaler.step(optimizer)
+            self._scaler.update()
+        else:
+            norm = None
+        return norm
+    def state_dict(self):
+        return self._scaler.state_dict()
+    def load_state_dict(self, state_dict):
+        self._scaler.load_state_dict(state_dict)
+class MyAverageMeter(object):
+    """Computes and stores the average and current value."""
+    def __init__(self, max_len=-1):
+        self.val_list = []
+        self.count = []
+        self.max_len = max_len
+        self.val = 0
+        self.avg = 0
+        self.var = 0
+    def update(self, val):
+        self.val = val
+        self.avg = 0
+        self.var = 0
+        if not math.isnan(val) and not math.isinf(val):
+            self.val_list.append(val)
+        if self.max_len > 0 and len(self.val_list) > self.max_len:
+            self.val_list = self.val_list[-self.max_len:]
+        if len(self.val_list) > 0:
+            self.avg = np.mean(np.array(self.val_list))
+            self.var = np.std(np.array(self.val_list))