Release detection and segmentation

c4552f79 · zhe chen · 5ba0b547 · c4552f79 · c4552f79 · c4552f79
Commit c4552f79 authored Mar 04, 2023 by zhe chen
20 changed files
--- a/segmentation/mmcv_custom/__init__.py
+++ b/segmentation/mmcv_custom/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+# -*- coding: utf-8 -*-
+from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor
+
+__all__ = ['CustomLayerDecayOptimizerConstructor',]
--- a/segmentation/mmcv_custom/custom_layer_decay_optimizer_constructor.py
+++ b/segmentation/mmcv_custom/custom_layer_decay_optimizer_constructor.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+"""
+Mostly copy-paste from BEiT library:
+https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py
+"""
+
+import json
+
+from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
+from mmcv.runner import get_dist_info
+from mmseg.utils import get_root_logger
+
+
+def get_num_layer_for_swin(var_name, num_max_layer, depths):
+    """Map a parameter name to the layer index used for layer-wise lr decay.
+
+    Args:
+        var_name (str): Dotted parameter name, e.g.
+            ``backbone.levels.2.blocks.5.mlp.fc1.weight``.
+        num_max_layer (int): Total number of layer slots; a name matching
+            none of the rules below is assigned ``num_max_layer - 1``.
+        depths (Sequence[int]): Number of blocks per backbone stage; the
+            first three entries are read.
+
+    Returns:
+        int: ``0`` for embedding-like parameters, a depth-derived index for
+        parameters under ``backbone.layers`` / ``backbone.levels``, and
+        ``num_max_layer - 1`` for everything else.
+    """
+    zero_prefixes = (
+        "backbone.patch_embed",
+        'decode_head.mask_embed',
+        'decode_head.cls_embed',
+        'decode_head.level_embed',
+        'decode_head.query_embed',
+        'decode_head.query_feat',
+        "backbone.cb_modules.0.patch_embed",
+    )
+    if var_name.startswith(zero_prefixes) or "level_embeds" in var_name:
+        return 0
+
+    if not var_name.startswith(("backbone.layers", "backbone.levels")):
+        return num_max_layer - 1
+
+    # Name layout: backbone.<levels|layers>.<stage>.<kind>.<index>...
+    parts = var_name.split('.')
+    inside_block = parts[3] not in ['downsample', 'norm']
+    stage_id = int(parts[2])
+
+    if inside_block:
+        layer_id = int(parts[4])
+        # layers for Swin-Large: [2, 2, 18, 2]
+        if stage_id == 0:
+            return layer_id + 1
+        if stage_id == 1:
+            return layer_id + 1 + depths[0]
+        if stage_id == 2:
+            return layer_id + 1 + depths[0] + depths[1]
+        return layer_id + 1 + depths[0] + depths[1] + depths[2]
+
+    # 'downsample' / 'norm' parameters of a stage: stages 2 and above share
+    # the same index.
+    if stage_id == 0:
+        return 1 + depths[0]
+    if stage_id == 1:
+        return 1 + depths[0] + depths[1]
+    return 1 + depths[0] + depths[1] + depths[2]
+
+
+@OPTIMIZER_BUILDERS.register_module()
+class CustomLayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
+    """Optimizer constructor that builds per-layer parameter groups.
+
+    Each trainable parameter is placed in a group keyed by its layer index
+    (from ``get_num_layer_for_swin``) and by whether weight decay applies.
+    Every group carries its own ``lr`` scaled by
+    ``layer_decay_rate ** (num_layers - layer_id - 1)``.
+    """
+
+    def add_params(self, params, module, prefix='', is_dcn_module=None):
+        """Add all trainable parameters of ``module`` to ``params``.
+
+        Parameters are grouped by layer index and weight-decay setting; the
+        resulting group dicts are appended to ``params``.
+
+        Keys read from ``self.paramwise_cfg``:
+            backbone_small_lr (default False): when it compares equal to
+                ``True``, groups whose scale is below 1 get an extra 0.1
+                factor.
+            dino_head (default False): enables the ``0.1x`` groups for
+                ``sampling_offsets`` / ``reference_points`` parameters that
+                fall in the last layer slot.
+            num_layers: 2 is added to this value before use.
+            layer_decay_rate: base of the per-layer lr decay.
+            depths: forwarded to ``get_num_layer_for_swin``.
+            offset_lr_scale (default 1.0): extra factor for backbone
+                ``sampling_offsets`` / ``reference_points`` groups.
+
+        Args:
+            params (list[dict]): A list of param groups, it will be modified
+                in place.
+            module (nn.Module): The module to be added.
+            prefix (str): Not used by this implementation.
+            is_dcn_module (int|float|None): Not used by this implementation.
+        """
+        parameter_groups = {}
+        logger = get_root_logger()
+        logger.info(self.paramwise_cfg)
+        backbone_small_lr = self.paramwise_cfg.get('backbone_small_lr', False)
+        dino_head = self.paramwise_cfg.get('dino_head', False)
+        # Two slots are added on top of the configured layer count.
+        num_layers = self.paramwise_cfg.get('num_layers') + 2
+        layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
+        depths = self.paramwise_cfg.get('depths')
+        offset_lr_scale = self.paramwise_cfg.get('offset_lr_scale', 1.0)
+
+        logger.info("Build CustomLayerDecayOptimizerConstructor %f - %d" %
+                    (layer_decay_rate, num_layers))
+        weight_decay = self.base_wd
+
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue  # frozen weights
+            # 1-D tensors, biases, relative-position tables, norm layers and
+            # sampling offsets are excluded from weight decay.
+            if len(param.shape) == 1 or name.endswith(".bias") or \
+                    "relative_position" in name or \
+                    "norm" in name or\
+                    "sampling_offsets" in name:
+                group_name = "no_decay"
+                this_weight_decay = 0.
+            else:
+                group_name = "decay"
+                this_weight_decay = weight_decay
+
+            layer_id = get_num_layer_for_swin(name, num_layers, depths)
+            # The group-name suffix selects the extra lr factor applied below
+            # ("0.1x" or "offset_lr_scale"); the checks there are substring
+            # matches on this name.
+            if layer_id == num_layers - 1 and dino_head and \
+                    ("sampling_offsets" in name or "reference_points" in name):
+                group_name = "layer_%d_%s_0.1x" % (layer_id, group_name)
+            elif ("sampling_offsets" in name or "reference_points" in name) and "backbone" in name:
+                group_name = "layer_%d_%s_offset_lr_scale" % (layer_id,
+                                                              group_name)
+            else:
+                group_name = "layer_%d_%s" % (layer_id, group_name)
+
+            # The group (and its lr scale) is created once, by the first
+            # parameter that maps to it.
+            if group_name not in parameter_groups:
+                scale = layer_decay_rate ** (num_layers - layer_id - 1)
+                if scale < 1 and backbone_small_lr == True:
+                    scale = scale * 0.1
+                if "0.1x" in group_name:
+                    scale = scale * 0.1
+                if "offset_lr_scale" in group_name:
+                    scale = scale * offset_lr_scale
+
+                parameter_groups[group_name] = {
+                    "weight_decay": this_weight_decay,
+                    "params": [],
+                    "param_names": [],
+                    "lr_scale": scale,
+                    "group_name": group_name,
+                    "lr": scale * self.base_lr,
+                }
+
+            parameter_groups[group_name]["params"].append(param)
+            parameter_groups[group_name]["param_names"].append(name)
+        # Only rank 0 logs the group summary (tensors are left out so the
+        # dict can be dumped as JSON).
+        rank, _ = get_dist_info()
+        if rank == 0:
+            to_display = {}
+            for key in parameter_groups:
+                to_display[key] = {
+                    "param_names": parameter_groups[key]["param_names"],
+                    "lr_scale": parameter_groups[key]["lr_scale"],
+                    "lr": parameter_groups[key]["lr"],
+                    "weight_decay": parameter_groups[key]["weight_decay"],
+                }
+            logger.info("Param groups = %s" % json.dumps(to_display, indent=2))
+
+        # state_dict = module.state_dict()
+        # for group_name in parameter_groups:
+        #     group = parameter_groups[group_name]
+        #     for name in group["param_names"]:
+        #         group["params"].append(state_dict[name])
+
+        params.extend(parameter_groups.values())
\ No newline at end of file
--- a/segmentation/mmseg_custom/__init__.py
+++ b/segmentation/mmseg_custom/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .models import *  # noqa: F401,F403
--- a/segmentation/mmseg_custom/models/__init__.py
+++ b/segmentation/mmseg_custom/models/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .backbones import *  # noqa: F401,F403
\ No newline at end of file
--- a/segmentation/mmseg_custom/models/backbones/__init__.py
+++ b/segmentation/mmseg_custom/models/backbones/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .intern_image import InternImage
+
+__all__ = ['InternImage']
--- a/segmentation/mmseg_custom/models/backbones/intern_image.py
+++ b/segmentation/mmseg_custom/models/backbones/intern_image.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+import torch
+import torch.nn as nn
+from collections import OrderedDict
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import trunc_normal_, DropPath
+from mmcv.runner import _load_checkpoint
+from mmcv.cnn import constant_init, trunc_normal_init
+from mmseg.utils import get_root_logger
+from mmseg.models.builder import BACKBONES
+
+from ops_dcnv3 import modules as opsm
+
+class to_channels_first(nn.Module):
+    """Module that moves the last axis of a 4-D tensor to position 1."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        # Axis order (0, 1, 2, 3) -> (0, 3, 1, 2).
+        axis_order = (0, 3, 1, 2)
+        return x.permute(*axis_order)
+
+
+class to_channels_last(nn.Module):
+    """Module that moves axis 1 of a 4-D tensor to the last position."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        # Axis order (0, 1, 2, 3) -> (0, 2, 3, 1).
+        axis_order = (0, 2, 3, 1)
+        return x.permute(*axis_order)
+
+
+def build_norm_layer(dim,
+                     norm_layer,
+                     in_format='channels_last',
+                     out_format='channels_last',
+                     eps=1e-6):
+    """Build a normalization layer wrapped with any needed layout permutes.
+
+    Args:
+        dim (int): Number of features passed to the norm layer.
+        norm_layer (str): ``'BN'`` (``nn.BatchNorm2d``) or ``'LN'``
+            (``nn.LayerNorm``).
+        in_format (str): Layout of the incoming tensor, ``'channels_last'``
+            or ``'channels_first'``.
+        out_format (str): Layout the result should have.
+        eps (float): Epsilon passed to ``nn.LayerNorm`` (not used for BN).
+
+    Returns:
+        nn.Sequential: optional permute, the norm layer, optional permute.
+
+    Raises:
+        NotImplementedError: if ``norm_layer`` is neither 'BN' nor 'LN'.
+    """
+    if norm_layer == 'BN':
+        # BatchNorm2d is applied in channels_first layout.
+        before = [to_channels_first()] if in_format == 'channels_last' else []
+        norm = nn.BatchNorm2d(dim)
+        after = [to_channels_last()] if out_format == 'channels_last' else []
+    elif norm_layer == 'LN':
+        # LayerNorm is applied in channels_last layout.
+        before = [to_channels_last()] if in_format == 'channels_first' else []
+        norm = nn.LayerNorm(dim, eps=eps)
+        after = [to_channels_first()] if out_format == 'channels_first' else []
+    else:
+        raise NotImplementedError(
+            f'build_norm_layer does not support {norm_layer}')
+    return nn.Sequential(*before, norm, *after)
+
+
+def build_act_layer(act_layer):
+    """Return a new activation module for the given name.
+
+    Args:
+        act_layer (str): ``'ReLU'``, ``'SiLU'`` or ``'GELU'``.
+
+    Returns:
+        nn.Module: ``nn.ReLU(inplace=True)``, ``nn.SiLU(inplace=True)`` or
+        ``nn.GELU()``.
+
+    Raises:
+        NotImplementedError: for any other name.
+    """
+    # Factories are lazy so only the requested class is touched.
+    factories = (
+        ('ReLU', lambda: nn.ReLU(inplace=True)),
+        ('SiLU', lambda: nn.SiLU(inplace=True)),
+        ('GELU', lambda: nn.GELU()),
+    )
+    for known_name, make in factories:
+        if act_layer == known_name:
+            return make()
+
+    raise NotImplementedError(f'build_act_layer does not support {act_layer}')
+
+
+class StemLayer(nn.Module):
+    r""" Stem layer of InternImage
+
+    Two stride-2 3x3 convolutions, each followed by a norm layer, with an
+    activation between them. The first conv produces ``out_chans // 2``
+    channels; the second norm is built to emit channels_last output.
+
+    Args:
+        in_chans (int): number of input channels
+        out_chans (int): number of output channels
+        act_layer (str): activation layer
+        norm_layer (str): normalization layer
+    """
+
+    def __init__(self,
+                 in_chans=3,
+                 out_chans=96,
+                 act_layer='GELU',
+                 norm_layer='BN'):
+        super().__init__()
+        mid_chans = out_chans // 2
+        conv_kwargs = dict(kernel_size=3, stride=2, padding=1)
+
+        # Sub-modules are created in the same order as they are applied.
+        self.conv1 = nn.Conv2d(in_chans, mid_chans, **conv_kwargs)
+        self.norm1 = build_norm_layer(mid_chans, norm_layer,
+                                      'channels_first', 'channels_first')
+        self.act = build_act_layer(act_layer)
+        self.conv2 = nn.Conv2d(mid_chans, out_chans, **conv_kwargs)
+        self.norm2 = build_norm_layer(out_chans, norm_layer, 'channels_first',
+                                      'channels_last')
+
+    def forward(self, x):
+        stages = (self.conv1, self.norm1, self.act, self.conv2, self.norm2)
+        for stage in stages:
+            x = stage(x)
+        return x
+
+
+class DownsampleLayer(nn.Module):
+    r""" Downsample layer of InternImage
+
+    A bias-free stride-2 3x3 convolution that doubles the channel count,
+    followed by a norm layer built to emit channels_last output.
+
+    Args:
+        channels (int): number of input channels
+        norm_layer (str): normalization layer
+    """
+
+    def __init__(self, channels, norm_layer='LN'):
+        super().__init__()
+        doubled = 2 * channels
+        self.conv = nn.Conv2d(channels,
+                              doubled,
+                              kernel_size=3,
+                              stride=2,
+                              padding=1,
+                              bias=False)
+        self.norm = build_norm_layer(doubled, norm_layer,
+                                     'channels_first', 'channels_last')
+
+    def forward(self, x):
+        # Move the last axis to position 1 before the convolution.
+        channels_first = x.permute(0, 3, 1, 2)
+        return self.norm(self.conv(channels_first))
+
+
+class MLPLayer(nn.Module):
+    r""" MLP layer of InternImage
+
+    Linear -> activation -> dropout -> Linear -> dropout.
+
+    Args:
+        in_features (int): number of input features
+        hidden_features (int): number of hidden features; falls back to
+            ``in_features`` when falsy
+        out_features (int): number of output features; falls back to
+            ``in_features`` when falsy
+        act_layer (str): activation layer
+        drop (float): dropout rate
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer='GELU',
+                 drop=0.):
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = build_act_layer(act_layer)
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        # The same dropout module is applied after both linear layers.
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class InternImageLayer(nn.Module):
+    r""" Basic layer of InternImage
+
+    Two residual branches: the core operator, then an MLP. Each branch is
+    normalized before the op (pre-norm) or after it (``post_norm=True``),
+    optionally scaled by a learnable per-channel gamma, and passed through
+    drop-path before being added to the input.
+
+    Args:
+        core_op (nn.Module): core operation of InternImage
+        channels (int): number of input channels
+        groups: passed to ``core_op`` as its ``group`` argument
+        mlp_ratio (float): ratio of mlp hidden features to input channels
+        drop (float): dropout rate
+        drop_path (float): drop path rate
+        act_layer (str): activation layer
+        norm_layer (str): normalization layer passed to ``core_op``
+        post_norm (bool): whether to use post normalization
+        layer_scale (float): initial layer-scale value; None disables it
+        offset_scale (float): offset scale
+        with_cp (bool): whether to use checkpoint
+    """
+
+    def __init__(self,
+                 core_op,
+                 channels,
+                 groups,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer='GELU',
+                 norm_layer='LN',
+                 post_norm=False,
+                 layer_scale=None,
+                 offset_scale=1.0,
+                 with_cp=False):
+        super().__init__()
+        self.channels = channels
+        self.groups = groups
+        self.mlp_ratio = mlp_ratio
+        self.with_cp = with_cp
+        self.post_norm = post_norm
+
+        # Sub-module creation order is kept: norm1, dcn, drop_path, norm2,
+        # mlp, then the optional gammas.
+        self.norm1 = build_norm_layer(channels, 'LN')
+        self.dcn = core_op(channels=channels,
+                           kernel_size=3,
+                           stride=1,
+                           pad=1,
+                           dilation=1,
+                           group=groups,
+                           offset_scale=offset_scale,
+                           act_layer=act_layer,
+                           norm_layer=norm_layer)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = build_norm_layer(channels, 'LN')
+        hidden = int(channels * mlp_ratio)
+        self.mlp = MLPLayer(in_features=channels,
+                            hidden_features=hidden,
+                            act_layer=act_layer,
+                            drop=drop)
+        self.layer_scale = layer_scale is not None
+        if self.layer_scale:
+            self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
+                                       requires_grad=True)
+            self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
+                                       requires_grad=True)
+
+    def forward(self, x):
+
+        def _branch(inp, op, norm, gamma):
+            # One residual branch: norm/op order depends on post_norm, and
+            # gamma (when given) scales the result before drop-path.
+            out = norm(op(inp)) if self.post_norm else op(norm(inp))
+            if gamma is not None:
+                out = gamma * out
+            return self.drop_path(out)
+
+        def _inner_forward(x):
+            gamma1 = self.gamma1 if self.layer_scale else None
+            gamma2 = self.gamma2 if self.layer_scale else None
+            x = x + _branch(x, self.dcn, self.norm1, gamma1)
+            x = x + _branch(x, self.mlp, self.norm2, gamma2)
+            return x
+
+        # Checkpointing is only used when the input requires grad.
+        if self.with_cp and x.requires_grad:
+            return checkpoint.checkpoint(_inner_forward, x)
+        return _inner_forward(x)
+
+
+class InternImageBlock(nn.Module):
+    r""" Block of InternImage
+
+    A stack of ``depth`` InternImageLayer modules, followed by a LayerNorm
+    (only when ``post_norm`` is False) and an optional DownsampleLayer.
+
+    Args:
+        core_op (nn.Module): core operation of InternImage
+        channels (int): number of input channels
+        depth (int): number of layers in this block
+        groups: passed to every layer as its ``groups`` argument
+        downsample (bool): whether to append a DownsampleLayer
+        mlp_ratio (float): ratio of mlp hidden features to input channels
+        drop (float): dropout rate
+        drop_path (float | list): drop path rate; a list is indexed per layer
+        act_layer (str): activation layer
+        norm_layer (str): normalization layer
+        post_norm (bool): whether to use post normalization
+        offset_scale (float): offset scale
+        layer_scale (float): layer scale
+        with_cp (bool): whether to use checkpoint
+    """
+
+    def __init__(self,
+                 core_op,
+                 channels,
+                 depth,
+                 groups,
+                 downsample=True,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer='GELU',
+                 norm_layer='LN',
+                 post_norm=False,
+                 offset_scale=1.0,
+                 layer_scale=None,
+                 with_cp=False):
+        super().__init__()
+        self.channels = channels
+        self.depth = depth
+        self.post_norm = post_norm
+
+        per_layer_rates = isinstance(drop_path, list)
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            rate = drop_path[i] if per_layer_rates else drop_path
+            self.blocks.append(
+                InternImageLayer(core_op=core_op,
+                                 channels=channels,
+                                 groups=groups,
+                                 mlp_ratio=mlp_ratio,
+                                 drop=drop,
+                                 drop_path=rate,
+                                 act_layer=act_layer,
+                                 norm_layer=norm_layer,
+                                 post_norm=post_norm,
+                                 layer_scale=layer_scale,
+                                 offset_scale=offset_scale,
+                                 with_cp=with_cp))
+        if not self.post_norm:
+            self.norm = build_norm_layer(channels, 'LN')
+        if downsample:
+            self.downsample = DownsampleLayer(channels=channels,
+                                              norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, return_wo_downsample=False):
+        for layer in self.blocks:
+            x = layer(x)
+        if not self.post_norm:
+            x = self.norm(x)
+
+        # Keep the features from before the optional downsampling step.
+        before_downsample = x
+        if self.downsample is not None:
+            x = self.downsample(x)
+
+        if return_wo_downsample:
+            return x, before_downsample
+        return x
+
+
+@BACKBONES.register_module()
+class InternImage(nn.Module):
+    r""" InternImage
+        A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions`  -
+          https://arxiv.org/abs/2211.05778
+    Args:
+        core_op (str): Name of the core operator class, looked up in
+            ``ops_dcnv3.modules``. Default: 'DCNv3'
+        channels (int): Number of channels of the first stage. Default: 64
+        depths (list): Number of layers in each stage. Default: [3, 4, 18, 5]
+        groups (list): Groups of the core operator in each stage.
+            Default: [3, 6, 12, 24]
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop_rate (float): Probability of an element to be zeroed. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2
+        drop_path_type (str): 'uniform' uses ``drop_path_rate`` for every
+            layer; any other value ramps the rate linearly from 0 to
+            ``drop_path_rate`` over all layers. Default: 'linear'
+        act_layer (str): Activation layer. Default: 'GELU'
+        norm_layer (str): Normalization layer. Default: 'LN'
+        layer_scale (float | None): Initial layer-scale value; None disables
+            layer scale. Default: None
+        offset_scale (float): Offset scale passed to the core operator.
+            Default: 1.0
+        post_norm (bool): Whether to use post normalization. Default: False
+        with_cp (bool): Use checkpoint or not. Default: False
+        out_indices (tuple[int]): Indices of the stages whose output (taken
+            before downsampling) is returned. Default: (0, 1, 2, 3)
+        init_cfg (dict | None): When given it must contain a 'checkpoint'
+            entry, which ``init_weights`` loads. Default: None
+        **kwargs: Accepted and ignored.
+    """
+
+    def __init__(self,
+                 core_op='DCNv3',
+                 channels=64,
+                 depths=[3, 4, 18, 5],
+                 groups=[3, 6, 12, 24],
+                 mlp_ratio=4.,
+                 drop_rate=0.,
+                 drop_path_rate=0.2,
+                 drop_path_type='linear',
+                 act_layer='GELU',
+                 norm_layer='LN',
+                 layer_scale=None,
+                 offset_scale=1.0,
+                 post_norm=False,
+                 with_cp=False,
+                 out_indices=(0, 1, 2, 3),
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__()
+        # Copy the sequences so the shared default lists (or a caller's
+        # list) are never aliased onto the instance.
+        depths = list(depths)
+        groups = list(groups)
+
+        self.core_op = core_op
+        self.num_levels = len(depths)
+        self.depths = depths
+        self.channels = channels
+        self.num_features = int(channels * 2**(self.num_levels - 1))
+        self.post_norm = post_norm
+        self.mlp_ratio = mlp_ratio
+        self.init_cfg = init_cfg
+        self.out_indices = out_indices
+        print(f'using core type: {core_op}')
+        print(f'using activation layer: {act_layer}')
+        print(f'using main norm layer: {norm_layer}')
+        print(f'using dpr: {drop_path_type}, {drop_path_rate}')
+
+        in_chans = 3
+        self.patch_embed = StemLayer(in_chans=in_chans,
+                                     out_chans=channels,
+                                     act_layer=act_layer,
+                                     norm_layer=norm_layer)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # One drop-path rate per layer, over all stages.
+        total_depth = sum(depths)
+        if drop_path_type == 'uniform':
+            dpr = [drop_path_rate] * total_depth
+        else:
+            dpr = [
+                x.item()
+                for x in torch.linspace(0, drop_path_rate, total_depth)
+            ]
+
+        self.levels = nn.ModuleList()
+        for i in range(self.num_levels):
+            level = InternImageBlock(
+                core_op=getattr(opsm, core_op),
+                channels=int(channels * 2**i),
+                depth=depths[i],
+                groups=groups[i],
+                mlp_ratio=self.mlp_ratio,
+                drop=drop_rate,
+                drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                post_norm=post_norm,
+                # every stage except the last one downsamples
+                downsample=(i < self.num_levels - 1),
+                layer_scale=layer_scale,
+                offset_scale=offset_scale,
+                with_cp=with_cp)
+            self.levels.append(level)
+
+        self.num_layers = len(depths)
+        self.apply(self._init_weights)
+        self.apply(self._init_deform_weights)
+
+    def init_weights(self):
+        """Initialize from scratch or load the checkpoint named in init_cfg.
+
+        Without ``init_cfg`` the Linear and LayerNorm modules are
+        re-initialized. Otherwise the checkpoint is loaded non-strictly after
+        removing a leading ``backbone.`` and then a leading ``module.`` from
+        each key.
+        """
+        logger = get_root_logger()
+        if self.init_cfg is None:
+            # Logger.warn is a deprecated alias of Logger.warning.
+            logger.warning(f'No pre-trained weights for '
+                           f'{self.__class__.__name__}, '
+                           f'training start from scratch')
+            for m in self.modules():
+                if isinstance(m, nn.Linear):
+                    trunc_normal_init(m, std=.02, bias=0.)
+                elif isinstance(m, nn.LayerNorm):
+                    constant_init(m, 1.0)
+        else:
+            assert 'checkpoint' in self.init_cfg, f'Only support ' \
+                                                  f'specify `Pretrained` in ' \
+                                                  f'`init_cfg` in ' \
+                                                  f'{self.__class__.__name__} '
+            # Item access works for both a plain dict and a config object.
+            ckpt = _load_checkpoint(self.init_cfg['checkpoint'],
+                                    logger=logger,
+                                    map_location='cpu')
+            if 'state_dict' in ckpt:
+                _state_dict = ckpt['state_dict']
+            elif 'model' in ckpt:
+                _state_dict = ckpt['model']
+            else:
+                _state_dict = ckpt
+
+            # Strip prefixes per key, so an empty checkpoint does not raise
+            # and keys without the prefix are never truncated.
+            state_dict = OrderedDict()
+            for k, v in _state_dict.items():
+                if k.startswith('backbone.'):
+                    k = k[len('backbone.'):]
+                if k.startswith('module.'):
+                    k = k[len('module.'):]
+                state_dict[k] = v
+
+            # load state_dict (non-strict: mismatched keys are only logged)
+            meg = self.load_state_dict(state_dict, False)
+            logger.info(meg)
+
+    def _init_weights(self, m):
+        """Default init applied to every sub-module at construction."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def _init_deform_weights(self, m):
+        """Reset the parameters of every core-operator module."""
+        if isinstance(m, getattr(opsm, self.core_op)):
+            m._reset_parameters()
+
+    def forward(self, x):
+        """Return the pre-downsample features of the stages in out_indices.
+
+        Each returned tensor is permuted with ``(0, 3, 1, 2)`` and made
+        contiguous.
+        """
+        x = self.patch_embed(x)
+        x = self.pos_drop(x)
+
+        seq_out = []
+        for level_idx, level in enumerate(self.levels):
+            x, x_ = level(x, return_wo_downsample=True)
+            if level_idx in self.out_indices:
+                seq_out.append(x_.permute(0, 3, 1, 2).contiguous())
+        return seq_out
--- a/segmentation/ops_dcnv3/functions/__init__.py
+++ b/segmentation/ops_dcnv3/functions/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch
--- a/segmentation/ops_dcnv3/functions/dcnv3_func.py
+++ b/segmentation/ops_dcnv3/functions/dcnv3_func.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.cuda.amp import custom_bwd, custom_fwd
+import DCNv3
+
+
+class DCNv3Function(Function):
+    """Autograd function wrapping the compiled ``DCNv3`` extension.
+
+    ``forward`` calls ``DCNv3.dcnv3_forward`` and ``backward`` calls
+    ``DCNv3.dcnv3_backward``; both are wrapped with the
+    ``torch.cuda.amp`` custom_fwd / custom_bwd decorators.
+    """
+
+    @staticmethod
+    @custom_fwd
+    def forward(
+            ctx, input, offset, mask,
+            kernel_h, kernel_w, stride_h, stride_w,
+            pad_h, pad_w, dilation_h, dilation_w,
+            group, group_channels, offset_scale, im2col_step):
+        """Run the extension's forward kernel.
+
+        The twelve non-tensor arguments are stored on ``ctx`` and the three
+        tensors are saved so that ``backward`` can pass them to the
+        extension again.
+        """
+        ctx.kernel_h = kernel_h
+        ctx.kernel_w = kernel_w
+        ctx.stride_h = stride_h
+        ctx.stride_w = stride_w
+        ctx.pad_h = pad_h
+        ctx.pad_w = pad_w
+        ctx.dilation_h = dilation_h
+        ctx.dilation_w = dilation_w
+        ctx.group = group
+        ctx.group_channels = group_channels
+        ctx.offset_scale = offset_scale
+        ctx.im2col_step = im2col_step
+        output = DCNv3.dcnv3_forward(
+            input, offset, mask, kernel_h,
+            kernel_w, stride_h, stride_w, pad_h,
+            pad_w, dilation_h, dilation_w, group,
+            group_channels, offset_scale, ctx.im2col_step)
+        ctx.save_for_backward(input, offset, mask)
+
+        return output
+
+    @staticmethod
+    @once_differentiable
+    @custom_bwd
+    def backward(ctx, grad_output):
+        """Run the extension's backward kernel.
+
+        Returns gradients for ``input``, ``offset`` and ``mask`` followed by
+        twelve ``None`` values, one per non-tensor argument of ``forward``.
+        """
+        input, offset, mask = ctx.saved_tensors
+        # grad_output is made contiguous before it is handed to the kernel.
+        grad_input, grad_offset, grad_mask = \
+            DCNv3.dcnv3_backward(
+                input, offset, mask, ctx.kernel_h,
+                ctx.kernel_w, ctx.stride_h, ctx.stride_w, ctx.pad_h,
+                ctx.pad_w, ctx.dilation_h, ctx.dilation_w, ctx.group,
+                ctx.group_channels, ctx.offset_scale, grad_output.contiguous(), ctx.im2col_step)
+
+        return grad_input, grad_offset, grad_mask, \
+            None, None, None, None, None, None, None, None, None, None, None, None
+
+
+def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1):
+    """Build the normalized kernel-centre location of every output position.
+
+    Args:
+        spatial_shapes: 4-element shape; elements 1 and 2 are read as the
+            height ``H_`` and width ``W_``.
+        device: device on which the tensors are created.
+        kernel_h, kernel_w (int): kernel size.
+        dilation_h, dilation_w (int): dilation.
+        pad_h, pad_w (int): not used by this function.
+        stride_h, stride_w (int): stride.
+
+    Returns:
+        torch.Tensor: float32 tensor of shape ``(1, H_out, W_out, 1, 2)``
+        whose last axis is ``(x / W_, y / H_)``.
+    """
+    _, H_, W_, _ = spatial_shapes
+    H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
+    W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
+
+    # Centre of the first kernel window along each axis.
+    first_y = (dilation_h * (kernel_h - 1)) // 2 + 0.5
+    first_x = (dilation_w * (kernel_w - 1)) // 2 + 0.5
+
+    ys = torch.linspace(first_y,
+                        first_y + (H_out - 1) * stride_h,
+                        H_out,
+                        dtype=torch.float32,
+                        device=device)
+    xs = torch.linspace(first_x,
+                        first_x + (W_out - 1) * stride_w,
+                        W_out,
+                        dtype=torch.float32,
+                        device=device)
+
+    ref_y, ref_x = torch.meshgrid(ys, xs)
+    # Normalize by the height and width read from spatial_shapes.
+    ref_y = ref_y.reshape(-1)[None] / H_
+    ref_x = ref_x.reshape(-1)[None] / W_
+
+    stacked = torch.stack((ref_x, ref_y), -1)
+    return stacked.reshape(1, H_out, W_out, 1, 2)
+
+
+def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h, dilation_w, group, device):
+    """Build the normalized offsets of the kernel taps, repeated per group.
+
+    Args:
+        spatial_shapes: 4-element shape; elements 1 and 2 are read as the
+            height ``H_`` and width ``W_``.
+        kernel_h, kernel_w (int): kernel size.
+        dilation_h, dilation_w (int): dilation.
+        group (int): number of groups the tap grid is repeated for.
+        device: device on which the tensors are created.
+
+    Returns:
+        torch.Tensor: float32 tensor of shape
+        ``(1, 1, 1, group * kernel_h * kernel_w, 2)`` whose last axis is
+        ``(x / W_, y / H_)``.
+    """
+    _, H_, W_, _ = spatial_shapes
+
+    # Offset of the first tap along each axis.
+    first_x = -((dilation_w * (kernel_w - 1)) // 2)
+    first_y = -((dilation_h * (kernel_h - 1)) // 2)
+
+    xs = torch.linspace(first_x,
+                        first_x + (kernel_w - 1) * dilation_w,
+                        kernel_w,
+                        dtype=torch.float32,
+                        device=device)
+    ys = torch.linspace(first_y,
+                        first_y + (kernel_h - 1) * dilation_h,
+                        kernel_h,
+                        dtype=torch.float32,
+                        device=device)
+    x, y = torch.meshgrid(xs, ys)
+
+    taps = torch.stack([x / W_, y / H_], -1).reshape(-1, 1, 2)
+    # Repeat the taps once per group, group-major.
+    per_group = taps.repeat(1, group, 1).permute(1, 0, 2)
+    return per_group.reshape(1, 1, 1, group * kernel_h * kernel_w, 2)
+
+
+def dcnv3_core_pytorch(
+        input, offset, mask, kernel_h,
+        kernel_w, stride_h, stride_w, pad_h,
+        pad_w, dilation_h, dilation_w, group,
+        group_channels, offset_scale):
+    """Pure-PyTorch reference implementation of the DCNv3 core operator.
+
+    For debug and test only; the CUDA version should be used otherwise.
+
+    Args:
+        input (torch.Tensor): ``(N, H, W, group * group_channels)``.
+        offset (torch.Tensor): ``(N, H_out, W_out, group * P * 2)`` with
+            ``P = kernel_h * kernel_w``.
+        mask (torch.Tensor): ``(N, H_out, W_out, group * P)``.
+        kernel_h, kernel_w (int): kernel size.
+        stride_h, stride_w (int): stride.
+        pad_h, pad_w (int): zero padding added on both sides of H and W.
+        dilation_h, dilation_w (int): dilation.
+        group (int): number of groups.
+        group_channels (int): channels per group.
+        offset_scale (float): factor applied to the tap grid and the offsets.
+
+    Returns:
+        torch.Tensor: contiguous ``(N, H_out, W_out, group * group_channels)``.
+    """
+    # F.pad consumes its pad list from the LAST dimension backwards, so for
+    # an (N, H, W, C) tensor the order is (C, C, W, W, H, H): pad_w belongs
+    # to the W pair and pad_h to the H pair.
+    input = F.pad(
+        input,
+        [0, 0, pad_w, pad_w, pad_h, pad_h])
+    N_, H_in, W_in, _ = input.shape
+    _, H_out, W_out, _ = offset.shape
+
+    ref = _get_reference_points(
+        input.shape, input.device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w)
+    grid = _generate_dilation_grids(
+        input.shape, kernel_h, kernel_w, dilation_h, dilation_w, group, input.device)
+    # (W_in, H_in) repeated for every tap, used to normalize the offsets.
+    spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
+        repeat(1, 1, 1, group*kernel_h*kernel_w).to(input.device)
+
+    sampling_locations = (ref + grid * offset_scale).repeat(N_, 1, 1, 1, 1).flatten(3, 4) + \
+        offset * offset_scale / spatial_norm
+
+    P_ = kernel_h * kernel_w
+    # Map [0, 1] locations to the [-1, 1] range used by grid_sample.
+    sampling_grids = 2 * sampling_locations - 1
+    # N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, W_in
+    input_ = input.view(N_, H_in*W_in, group*group_channels).transpose(1, 2).\
+        reshape(N_*group, group_channels, H_in, W_in)
+    # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
+    sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
+        flatten(0, 1)
+    # N_*group, group_channels, H_out*W_out, P_
+    sampling_input_ = F.grid_sample(
+        input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
+
+    # (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_ -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_)
+    mask = mask.view(N_, H_out*W_out, group, P_).transpose(1, 2).\
+        reshape(N_*group, 1, H_out*W_out, P_)
+    # Weight every sampled tap by its mask value and sum over the taps.
+    output = (sampling_input_ * mask).sum(-1).view(N_,
+                                                   group*group_channels, H_out*W_out)
+
+    return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
--- a/segmentation/ops_dcnv3/make.sh
+++ b/segmentation/ops_dcnv3/make.sh
+#!/usr/bin/env bash
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+# Build the package described by setup.py in this directory and install it
+# into the Python environment that `python` resolves to.
+python setup.py build install
--- a/segmentation/ops_dcnv3/modules/__init__.py
+++ b/segmentation/ops_dcnv3/modules/__init__.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .dcnv3 import DCNv3, DCNv3_pytorch
\ No newline at end of file
--- a/segmentation/ops_dcnv3/modules/dcnv3.py
+++ b/segmentation/ops_dcnv3/modules/dcnv3.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import warnings
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+from ..functions import DCNv3Function, dcnv3_core_pytorch
+
+
+class to_channels_first(nn.Module):
+    """Parameter-free layer that moves the last axis of a 4-D tensor to axis 1."""
+
+    def forward(self, x):
+        # (d0, d1, d2, d3) -> (d0, d3, d1, d2)
+        return x.permute(0, 3, 1, 2)
+
+
+class to_channels_last(nn.Module):
+    """Parameter-free layer that moves axis 1 of a 4-D tensor to the last axis."""
+
+    def forward(self, x):
+        # (d0, d1, d2, d3) -> (d0, d2, d3, d1)
+        return x.permute(0, 2, 3, 1)
+
+
+def build_norm_layer(dim,
+                     norm_layer,
+                     in_format='channels_last',
+                     out_format='channels_last',
+                     eps=1e-6):
+    """Wrap a normalization layer with the permutes its data layout needs.
+
+    :param dim         number of features handed to the normalization layer
+    :param norm_layer  'BN' (nn.BatchNorm2d) or 'LN' (nn.LayerNorm)
+    :param in_format   layout of the incoming tensor
+    :param out_format  layout the returned module should produce
+    :param eps         epsilon for nn.LayerNorm (not passed to BatchNorm2d)
+    :return            nn.Sequential of [permute], norm, [permute]
+    :raises NotImplementedError  for any other norm_layer value
+    """
+    if norm_layer == 'BN':
+        # BatchNorm2d runs on channels_first data.
+        before = [to_channels_first()] if in_format == 'channels_last' else []
+        norm = nn.BatchNorm2d(dim)
+        after = [to_channels_last()] if out_format == 'channels_last' else []
+    elif norm_layer == 'LN':
+        # LayerNorm normalizes the last axis, i.e. channels_last data.
+        before = [to_channels_last()] if in_format == 'channels_first' else []
+        norm = nn.LayerNorm(dim, eps=eps)
+        after = [to_channels_first()] if out_format == 'channels_first' else []
+    else:
+        raise NotImplementedError(
+            f'build_norm_layer does not support {norm_layer}')
+    return nn.Sequential(*before, norm, *after)
+
+
+def build_act_layer(act_layer):
+    """Instantiate the activation module named by ``act_layer``.
+
+    :param act_layer  'ReLU', 'SiLU' (both created with inplace=True) or 'GELU'
+    :return           a new nn.Module instance
+    :raises NotImplementedError  for any other name
+    """
+    # Factories are lambdas so that only the requested class is looked up.
+    factories = (
+        ('ReLU', lambda: nn.ReLU(inplace=True)),
+        ('SiLU', lambda: nn.SiLU(inplace=True)),
+        ('GELU', lambda: nn.GELU()),
+    )
+    for name, make in factories:
+        if act_layer == name:
+            return make()
+
+    raise NotImplementedError(f'build_act_layer does not support {act_layer}')
+
+
+def _is_power_of_2(n):
+    """Return True when ``n`` is a positive integer power of two.
+
+    :param n  non-negative int to test
+    :raises ValueError  if n is not an int or is negative
+    """
+    valid = isinstance(n, int) and n >= 0
+    if not valid:
+        raise ValueError(
+            "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+
+    # A power of two has exactly one set bit, so clearing the lowest set bit
+    # leaves zero; zero itself is excluded explicitly.
+    return n != 0 and (n & (n - 1)) == 0
+
+
+class DCNv3_pytorch(nn.Module):
+    """DCNv3 operator evaluated with the pure-PyTorch core ``dcnv3_core_pytorch``."""
+
+    def __init__(
+            self, channels=64, kernel_size=3, stride=1,
+            pad=1, dilation=1, group=4, offset_scale=1.0,
+            act_layer='GELU', norm_layer='LN'):
+        """
+        DCNv3 Module
+        :param channels      number of input/output channels; must be divisible by group
+        :param kernel_size   side length of the square sampling kernel
+        :param stride        stride forwarded to the core operator
+        :param pad           padding forwarded to the core operator
+        :param dilation      dilation forwarded to the core operator
+        :param group         number of groups; each predicts its own offsets and mask
+        :param offset_scale  multiplier forwarded to the core operator
+        :param act_layer     activation name passed to build_act_layer
+        :param norm_layer    normalization name passed to build_norm_layer
+        :raises ValueError   if channels is not divisible by group
+        """
+        super().__init__()
+        if channels % group != 0:
+            raise ValueError(
+                f'channels must be divisible by group, but got {channels} and {group}')
+        _d_per_group = channels // group
+        # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_group):
+            warnings.warn(
+                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
+                "which is more efficient in our CUDA implementation.")
+
+        self.offset_scale = offset_scale
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        # Honour the constructor argument; it used to be hard-coded to 1,
+        # which silently ignored any dilation the caller asked for.
+        self.dilation = dilation
+        self.pad = pad
+        self.group = group
+        self.group_channels = _d_per_group
+
+        # Depth-wise conv on channels_first data; the norm wrapper converts
+        # the result back to channels_last for the Linear heads below.
+        self.dw_conv = nn.Sequential(
+            nn.Conv2d(
+                channels,
+                channels,
+                kernel_size=kernel_size,
+                stride=1,
+                padding=(kernel_size-1)//2,
+                groups=channels),
+            build_norm_layer(
+                channels,
+                norm_layer,
+                'channels_first',
+                'channels_last'),
+            build_act_layer(act_layer))
+        # Two offset values and one mask value per (group, kernel point).
+        self.offset = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size * 2)
+        self.mask = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size)
+        self.input_proj = nn.Linear(channels, channels)
+        self.output_proj = nn.Linear(channels, channels)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Zero the offset/mask heads and Xavier-initialize the projections."""
+        constant_(self.offset.weight.data, 0.)
+        constant_(self.offset.bias.data, 0.)
+        constant_(self.mask.weight.data, 0.)
+        constant_(self.mask.bias.data, 0.)
+        xavier_uniform_(self.input_proj.weight.data)
+        constant_(self.input_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, input):
+        """
+        :param input                       (N, H, W, C)
+        :return output                     (N, H, W, C)
+        """
+        N, H, W, _ = input.shape
+
+        x = self.input_proj(input)
+
+        # Offsets and masks are predicted from the un-projected input.
+        x1 = input.permute(0, 3, 1, 2)
+        x1 = self.dw_conv(x1)
+        offset = self.offset(x1)
+        # Softmax over the kernel points of each group.
+        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
+        mask = F.softmax(mask, -1).reshape(N, H, W, -1)
+
+        x = dcnv3_core_pytorch(
+            x, offset, mask,
+            self.kernel_size, self.kernel_size,
+            self.stride, self.stride,
+            self.pad, self.pad,
+            self.dilation, self.dilation,
+            self.group, self.group_channels,
+            self.offset_scale)
+        x = self.output_proj(x)
+
+        return x
+
+
+class DCNv3(nn.Module):
+    """DCNv3 operator evaluated through the compiled ``DCNv3Function``."""
+
+    def __init__(
+            self, channels=64, kernel_size=3, stride=1,
+            pad=1, dilation=1, group=4, offset_scale=1.0,
+            act_layer='GELU', norm_layer='LN'):
+        """
+        DCNv3 Module
+        :param channels      number of input/output channels; must be divisible by group
+        :param kernel_size   side length of the square sampling kernel
+        :param stride        stride forwarded to the core operator
+        :param pad           padding forwarded to the core operator
+        :param dilation      dilation forwarded to the core operator
+        :param group         number of groups; each predicts its own offsets and mask
+        :param offset_scale  multiplier forwarded to the core operator
+        :param act_layer     activation name passed to build_act_layer
+        :param norm_layer    normalization name passed to build_norm_layer
+        :raises ValueError   if channels is not divisible by group
+        """
+        super().__init__()
+        if channels % group != 0:
+            raise ValueError(
+                f'channels must be divisible by group, but got {channels} and {group}')
+        _d_per_group = channels // group
+        # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_group):
+            warnings.warn(
+                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
+                "which is more efficient in our CUDA implementation.")
+
+        self.offset_scale = offset_scale
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        # Honour the constructor argument; it used to be hard-coded to 1,
+        # which silently ignored any dilation the caller asked for.
+        self.dilation = dilation
+        self.pad = pad
+        self.group = group
+        self.group_channels = _d_per_group
+
+        # Depth-wise conv on channels_first data; the norm wrapper converts
+        # the result back to channels_last for the Linear heads below.
+        self.dw_conv = nn.Sequential(
+            nn.Conv2d(
+                channels,
+                channels,
+                kernel_size=kernel_size,
+                stride=1,
+                padding=(kernel_size-1)//2,
+                groups=channels),
+            build_norm_layer(
+                channels,
+                norm_layer,
+                'channels_first',
+                'channels_last'),
+            build_act_layer(act_layer))
+        # Two offset values and one mask value per (group, kernel point).
+        self.offset = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size * 2)
+        self.mask = nn.Linear(
+            channels,
+            group * kernel_size * kernel_size)
+        self.input_proj = nn.Linear(channels, channels)
+        self.output_proj = nn.Linear(channels, channels)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Zero the offset/mask heads and Xavier-initialize the projections."""
+        constant_(self.offset.weight.data, 0.)
+        constant_(self.offset.bias.data, 0.)
+        constant_(self.mask.weight.data, 0.)
+        constant_(self.mask.bias.data, 0.)
+        xavier_uniform_(self.input_proj.weight.data)
+        constant_(self.input_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, input):
+        """
+        :param input                       (N, H, W, C)
+        :return output                     (N, H, W, C)
+        """
+        N, H, W, _ = input.shape
+
+        x = self.input_proj(input)
+        dtype = x.dtype
+
+        # Offsets and masks are predicted from the un-projected input.
+        x1 = input.permute(0, 3, 1, 2)
+        x1 = self.dw_conv(x1)
+        offset = self.offset(x1)
+        # Softmax over the kernel points of each group, then cast to the
+        # dtype of the projected input that is passed alongside it.
+        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
+        mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype)
+
+        x = DCNv3Function.apply(
+            x, offset, mask,
+            self.kernel_size, self.kernel_size,
+            self.stride, self.stride,
+            self.pad, self.pad,
+            self.dilation, self.dilation,
+            self.group, self.group_channels,
+            self.offset_scale,
+            # NOTE(review): presumably the im2col_step of the compiled op -
+            # confirm against DCNv3Function.
+            256)
+        x = self.output_proj(x)
+
+        return x
--- a/segmentation/ops_dcnv3/setup.py
+++ b/segmentation/ops_dcnv3/setup.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+import os
+import glob
+
+import torch
+
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+
+from setuptools import find_packages
+from setuptools import setup
+
+requirements = ["torch", "torchvision"]
+
+
+def get_extensions():
+    """Describe the DCNv3 extension module for ``setup()``.
+
+    Collects every ``*.cpp`` under ``src/`` and ``src/cpu/`` and every ``*.cu``
+    under ``src/cuda/`` next to this file and wraps them in one CUDAExtension
+    named "DCNv3" compiled with the ``WITH_CUDA`` macro.
+
+    :return  single-element list holding the extension
+    :raises NotImplementedError  when torch reports no CUDA device or
+                                 CUDA_HOME is unset
+    """
+    # There is no CPU-only build, so fail before touching the file system.
+    if not (torch.cuda.is_available() and CUDA_HOME is not None):
+        raise NotImplementedError('CUDA is not available')
+
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "src")
+
+    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
+
+    # The glob patterns are rooted at an absolute directory, so the results
+    # are already absolute paths and need no further joining.
+    sources = main_file + source_cpu + source_cuda
+    define_macros = [("WITH_CUDA", None)]
+    extra_compile_args = {
+        "cxx": [],
+        "nvcc": [
+            # "-DCUDA_HAS_FP16=1",
+            # "-D__CUDA_NO_HALF_OPERATORS__",
+            # "-D__CUDA_NO_HALF_CONVERSIONS__",
+            # "-D__CUDA_NO_HALF2_OPERATORS__",
+        ],
+    }
+
+    include_dirs = [extensions_dir]
+    ext_modules = [
+        CUDAExtension(
+            "DCNv3",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+    return ext_modules
+
+
+# Package metadata plus the compiled extension; BuildExtension is torch's
+# build_ext command for C++/CUDA extension sources.
+setup(
+    name="DCNv3",
+    version="1.0",
+    author="InternImage",
+    url="https://github.com/OpenGVLab/InternImage",
+    description="PyTorch Wrapper for CUDA Functions of DCNv3",
+    packages=find_packages(exclude=("configs", "tests")),
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
--- a/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
+++ b/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+// CPU entry point for the DCNv3 forward pass. No CPU kernel exists: the body
+// unconditionally raises through AT_ERROR, so none of the arguments are read.
+at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
+                             const at::Tensor &mask, const int kernel_h,
+                             const int kernel_w, const int stride_h,
+                             const int stride_w, const int pad_h,
+                             const int pad_w, const int dilation_h,
+                             const int dilation_w, const int group,
+                             const int group_channels, const float offset_scale,
+                             const int im2col_step) {
+    AT_ERROR("Not implement on cpu");
+}
+
+// CPU entry point for the DCNv3 backward pass. No CPU kernel exists: the body
+// unconditionally raises through AT_ERROR, so none of the arguments are read.
+std::vector<at::Tensor>
+dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
+                   const at::Tensor &mask, const int kernel_h,
+                   const int kernel_w, const int stride_h, const int stride_w,
+                   const int pad_h, const int pad_w, const int dilation_h,
+                   const int dilation_w, const int group,
+                   const int group_channels, const float offset_scale,
+                   const at::Tensor &grad_output, const int im2col_step) {
+    AT_ERROR("Not implement on cpu");
+}
--- a/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.h
+++ b/segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.h
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+// Declaration of the CPU forward entry point; its parameter list mirrors
+// dcnv3_cuda_forward. The definition in dcnv3_cpu.cpp only raises an error.
+at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
+                             const at::Tensor &mask, const int kernel_h,
+                             const int kernel_w, const int stride_h,
+                             const int stride_w, const int pad_h,
+                             const int pad_w, const int dilation_h,
+                             const int dilation_w, const int group,
+                             const int group_channels, const float offset_scale,
+                             const int im2col_step);
+
+// Declaration of the CPU backward entry point; its parameter list mirrors
+// dcnv3_cuda_backward. The definition in dcnv3_cpu.cpp only raises an error.
+std::vector<at::Tensor>
+dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
+                   const at::Tensor &mask, const int kernel_h,
+                   const int kernel_w, const int stride_h, const int stride_w,
+                   const int pad_h, const int pad_w, const int dilation_h,
+                   const int dilation_w, const int group,
+                   const int group_channels, const float offset_scale,
+                   const at::Tensor &grad_output, const int im2col_step);
--- a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include "cuda/dcnv3_im2col_cuda.cuh"
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <torch/torch.h>
+
+// Forward pass of DCNv3 on CUDA.
+//
+// input is read as (batch, height_in, width_in, channels) with
+// channels == group * group_channels; offset and mask must be contiguous CUDA
+// tensors laid out per output position. The batch is processed in chunks of
+// min(batch, im2col_step) samples, one dcnv3_im2col_cuda launch per chunk.
+//
+// Returns a tensor of shape
+// (batch, height_out, width_out, group * group_channels) created with the
+// options of input. Raises if a tensor is non-contiguous or not on CUDA, if
+// the batch is empty or not a multiple of the chunk size, or if the channel
+// count does not match group * group_channels.
+at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
+                              const at::Tensor &mask, const int kernel_h,
+                              const int kernel_w, const int stride_h,
+                              const int stride_w, const int pad_h,
+                              const int pad_w, const int dilation_h,
+                              const int dilation_w, const int group,
+                              const int group_channels,
+                              const float offset_scale, const int im2col_step) {
+    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
+    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
+    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
+    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
+    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
+    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
+
+    const int batch = input.size(0);
+    const int height_in = input.size(1);
+    const int width_in = input.size(2);
+    const int channels = input.size(3);
+    // Standard convolution output-size arithmetic.
+    const int height_out =
+        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
+        1;
+    const int width_out =
+        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
+        1;
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    // Guard the modulo and divisions below against a zero chunk size
+    // (empty batch or non-positive im2col_step).
+    AT_ASSERTM(im2col_step_ > 0,
+               "batch and im2col_step must be positive, got batch=", batch,
+               ", im2col_step=", im2col_step);
+    // The message arguments are concatenated, not printf-formatted, so the
+    // values are passed as separate pieces instead of through "%d".
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch,
+               ") must be divisible by im2col_step(", im2col_step_, ")");
+    AT_ASSERTM(channels == (group * group_channels),
+               "Input channels and group times group channels wont match: (",
+               channels, " vs ", group * group_channels, ").");
+
+    auto output =
+        at::zeros({batch, height_out, width_out, group * group_channels},
+                  input.options());
+
+    const int batch_n = im2col_step_;
+    // View the output as (num_chunks, chunk, H_out, W_out, C) so that each
+    // chunk can be selected without copying.
+    auto output_n = output.view({batch / batch_n, batch_n, height_out,
+                                 width_out, group * group_channels});
+    // Element counts of one sample, used to advance the raw pointers.
+    auto per_input_size = height_in * width_in * group * group_channels;
+    auto per_offset_size =
+        height_out * width_out * group * kernel_h * kernel_w * 2;
+    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
+    for (int n = 0; n < batch / im2col_step_; ++n) {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+            input.type(), "dcnv3_cuda_forward", ([&] {
+                dcnv3_im2col_cuda(
+                    at::cuda::getCurrentCUDAStream(),
+                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
+                    offset.data<scalar_t>() +
+                        n * im2col_step_ * per_offset_size,
+                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
+                    columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
+                    stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
+                    group_channels, batch_n, height_in, width_in, height_out,
+                    width_out, offset_scale);
+            }));
+    }
+
+    return output;
+}
+
+// Backward pass of DCNv3 on CUDA.
+//
+// Takes the same input/offset/mask and geometry arguments as the forward pass
+// plus grad_output, which is viewed as
+// (num_chunks, chunk, height_out * width_out, group, group_channels).
+// The batch is processed in chunks of min(batch, im2col_step) samples, one
+// dcnv3_col2im_cuda launch per chunk.
+//
+// Returns {grad_input, grad_offset, grad_mask}, each shaped like the matching
+// forward argument. For half-precision input the gradients are accumulated in
+// float buffers and cast back to half before returning. Raises under the same
+// conditions as the forward pass, and also when grad_output is non-contiguous
+// or not on CUDA.
+std::vector<at::Tensor>
+dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
+                    const at::Tensor &mask, const int kernel_h,
+                    const int kernel_w, const int stride_h, const int stride_w,
+                    const int pad_h, const int pad_w, const int dilation_h,
+                    const int dilation_w, const int group,
+                    const int group_channels, const float offset_scale,
+                    const at::Tensor &grad_output, const int im2col_step) {
+
+    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
+    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
+    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(),
+               "grad_output tensor has to be contiguous");
+    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
+    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
+    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(),
+               "grad_output must be a CUDA tensor");
+
+    const int batch = input.size(0);
+    const int height_in = input.size(1);
+    const int width_in = input.size(2);
+    const int channels = input.size(3);
+    // Standard convolution output-size arithmetic.
+    const int height_out =
+        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
+        1;
+    const int width_out =
+        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
+        1;
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    // Guard the modulo and divisions below against a zero chunk size
+    // (empty batch or non-positive im2col_step).
+    AT_ASSERTM(im2col_step_ > 0,
+               "batch and im2col_step must be positive, got batch=", batch,
+               ", im2col_step=", im2col_step);
+    // The message arguments are concatenated, not printf-formatted, so the
+    // values are passed as separate pieces instead of through "%d".
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(", batch,
+               ") must be divisible by im2col_step(", im2col_step_, ")");
+    AT_ASSERTM(channels == (group * group_channels),
+               "Input channels and group times group channels wont match: (",
+               channels, " vs ", group * group_channels, ").");
+
+    // Half inputs accumulate their gradients in float buffers; this matches
+    // the opmath_t pointers handed to the kernel below.
+    auto dtype = input.dtype();
+    if (dtype == at::kHalf) {
+        dtype = at::kFloat;
+    }
+
+    auto grad_input = at::zeros_like(input, dtype);
+    auto grad_offset = at::zeros_like(offset, dtype);
+    auto grad_mask = at::zeros_like(mask, dtype);
+
+    const int batch_n = im2col_step_;
+    // Element counts of one sample, used to advance the raw pointers.
+    auto per_input_size = height_in * width_in * group * group_channels;
+    auto per_offset_size =
+        height_out * width_out * group * kernel_h * kernel_w * 2;
+    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
+    auto grad_output_n =
+        grad_output.view({batch / im2col_step_, batch_n, height_out * width_out,
+                          group, group_channels});
+
+    for (int n = 0; n < batch / im2col_step_; ++n) {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+            input.type(), "dcnv3_cuda_backward", ([&] {
+                dcnv3_col2im_cuda(
+                    at::cuda::getCurrentCUDAStream(),
+                    grad_output_g.data<scalar_t>(),
+                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
+                    offset.data<scalar_t>() +
+                        n * im2col_step_ * per_offset_size,
+                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
+                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                    dilation_h, dilation_w, group, group_channels, batch_n,
+                    height_in, width_in, height_out, width_out, offset_scale,
+                    grad_input.data<opmath_t>() +
+                        n * im2col_step_ * per_input_size,
+                    grad_offset.data<opmath_t>() +
+                        n * im2col_step_ * per_offset_size,
+                    grad_mask.data<opmath_t>() +
+                        n * im2col_step_ * per_mask_size);
+            }));
+    }
+
+    // Hand half-precision callers gradients in their own dtype.
+    if (input.dtype() == torch::kHalf) {
+        return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
+                grad_mask.to(torch::kHalf)};
+    } else {
+        return {grad_input, grad_offset, grad_mask};
+    }
+}
\ No newline at end of file
--- a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.h
+++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.h
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+// Declaration of the CUDA forward entry point (defined in dcnv3_cuda.cu).
+// Takes the input, offset and mask tensors followed by the kernel, stride,
+// pad and dilation sizes, the group layout, offset_scale and im2col_step;
+// returns the output tensor.
+at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
+                              const at::Tensor &mask, const int kernel_h,
+                              const int kernel_w, const int stride_h,
+                              const int stride_w, const int pad_h,
+                              const int pad_w, const int dilation_h,
+                              const int dilation_w, const int group,
+                              const int group_channels,
+                              const float offset_scale, const int im2col_step);
+
+// Declaration of the CUDA backward entry point (defined in dcnv3_cuda.cu).
+// Takes the forward arguments plus grad_output and returns the gradients
+// {grad_input, grad_offset, grad_mask}.
+std::vector<at::Tensor>
+dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
+                    const at::Tensor &mask, const int kernel_h,
+                    const int kernel_w, const int stride_h, const int stride_w,
+                    const int pad_h, const int pad_w, const int dilation_h,
+                    const int dilation_w, const int group,
+                    const int group_channels, const float offset_scale,
+                    const at::Tensor &grad_output, const int im2col_step);
--- a/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/OpMathType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+
+// Loop header for kernels: each thread starts at its global 1-D index and
+// advances by the total number of launched threads until it reaches n.
+#define CUDA_KERNEL_LOOP(i, n)                                                 \
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);               \
+         i += blockDim.x * gridDim.x)
+
+// Thread count constant; NOTE(review): presumably the default block size for
+// the kernel launches in this header - confirm at the launch sites.
+const int CUDA_NUM_THREADS = 256;
+// Number of blocks of num_threads threads needed to cover N items
+// (ceiling division).
+inline int GET_BLOCKS(const int N, const int num_threads) {
+    return (N + num_threads - 1) / num_threads;
+}
+
+// Arithmetic type paired with scalar_t; only usable where a scalar_t template
+// parameter is in scope.
+#define opmath_t at::opmath_type<scalar_t>
+
+// Bilinearly interpolates channel c of group g at the fractional position
+// (h, w) of one image stored as (height, width, group * group_channels).
+// Each of the four neighbouring pixels contributes only when it passes the
+// bounds test written next to it; otherwise its value stays 0.
+// NOTE(review): the low corners are only tested against 0 and the high
+// corners only against height-1 / width-1, so this presumably relies on the
+// caller ensuring -1 < h < height and -1 < w < width - confirm at call sites.
+template <typename scalar_t>
+__device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
+                                          const int &height, const int &width,
+                                          const int &group,
+                                          const int &group_channels,
+                                          const opmath_t &h, const opmath_t &w,
+                                          const int &g, const int &c) {
+    // Integer corners surrounding (h, w).
+    const int h_low = floor(h);
+    const int w_low = floor(w);
+    const int h_high = h_low + 1;
+    const int w_high = w_low + 1;
+
+    // Fractional distances to the low corner (lh, lw) and their complements.
+    const opmath_t lh = h - h_low;
+    const opmath_t lw = w - w_low;
+    const opmath_t hh = 1 - lh, hw = 1 - lw;
+
+    // Element strides of one pixel (w_stride) and one row (h_stride).
+    const int w_stride = group * group_channels;
+    const int h_stride = width * w_stride;
+    const int h_low_ptr_offset = h_low * h_stride;
+    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+    const int w_low_ptr_offset = w_low * w_stride;
+    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+    // Offset of the requested channel inside a pixel.
+    const int base_ptr = g * group_channels + c;
+
+    // v1..v4: top-left, top-right, bottom-left, bottom-right neighbours.
+    opmath_t v1 = 0;
+    if (h_low >= 0 && w_low >= 0) {
+        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+        v1 = bottom_data[ptr1];
+    }
+    opmath_t v2 = 0;
+    if (h_low >= 0 && w_high <= width - 1) {
+        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+        v2 = bottom_data[ptr2];
+    }
+    opmath_t v3 = 0;
+    if (h_high <= height - 1 && w_low >= 0) {
+        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+        v3 = bottom_data[ptr3];
+    }
+    opmath_t v4 = 0;
+    if (h_high <= height - 1 && w_high <= width - 1) {
+        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+        v4 = bottom_data[ptr4];
+    }
+    // Bilinear weights of the four neighbours.
+    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+    return val;
+}
+
+// Backward step for a single bilinear sample taken at (h, w).
+//
+// bottom_data and grad_im are addressed as
+//   (row * width + col) * (nheads * group_channels) + m * group_channels + c.
+// A corner takes part only if its low index is >= 0 (low row/column) or its
+// high index is <= size - 1 (high row/column); the opposite bound is not
+// checked here. Skipped corners contribute a value of 0.
+//
+// Effects:
+//   grad_im        += bilinear weight * top_grad * mask   (atomicAdd, per corner)
+//   grad_mask[0]    = top_grad * interpolated value       (plain store)
+//   grad_offset[0]  = offset_scale * d(value)/dw * top_grad * mask
+//   grad_offset[1]  = offset_scale * d(value)/dh * top_grad * mask
+template <typename scalar_t>
+__device__ void dcnv3_col2im_bilinear(
+    const scalar_t *&bottom_data, const int &height, const int &width,
+    const int &nheads, const int &group_channels, const opmath_t &h,
+    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
+    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
+    opmath_t *grad_offset, opmath_t *grad_mask) {
+    // Lattice rows/columns bracketing the sample point.
+    const int row_lo = floor(h);
+    const int col_lo = floor(w);
+    const int row_hi = row_lo + 1;
+    const int col_hi = col_lo + 1;
+
+    // Fractional position inside the cell and its complement.
+    const opmath_t frac_h = h - row_lo;
+    const opmath_t frac_w = w - col_lo;
+    const opmath_t rem_h = 1 - frac_h;
+    const opmath_t rem_w = 1 - frac_w;
+
+    // Flat-index pieces for the four corners.
+    const int col_stride = nheads * group_channels;
+    const int row_stride = width * col_stride;
+    const int row_lo_off = row_lo * row_stride;
+    const int row_hi_off = row_lo_off + row_stride;
+    const int col_lo_off = col_lo * col_stride;
+    const int col_hi_off = col_lo_off + col_stride;
+    const int chan_off = m * group_channels + c;
+
+    // Bilinear weights: first suffix letter is the row (l/h), second the column.
+    const opmath_t wt_ll = rem_h * rem_w;
+    const opmath_t wt_lh = rem_h * frac_w;
+    const opmath_t wt_hl = frac_h * rem_w;
+    const opmath_t wt_hh = frac_h * frac_w;
+    const opmath_t scaled_grad = top_grad * mask;
+
+    const bool row_lo_ok = row_lo >= 0;
+    const bool row_hi_ok = row_hi <= height - 1;
+    const bool col_lo_ok = col_lo >= 0;
+    const bool col_hi_ok = col_hi <= width - 1;
+
+    // Partial derivatives of the interpolated value w.r.t. h and w.
+    opmath_t d_val_dh = 0;
+    opmath_t d_val_dw = 0;
+    opmath_t val_ll = 0, val_lh = 0, val_hl = 0, val_hh = 0;
+
+    if (row_lo_ok && col_lo_ok) {
+        const int idx = row_lo_off + col_lo_off + chan_off;
+        val_ll = bottom_data[idx];
+        d_val_dh -= rem_w * val_ll;
+        d_val_dw -= rem_h * val_ll;
+        atomicAdd(&grad_im[idx], wt_ll * scaled_grad);
+    }
+    if (row_lo_ok && col_hi_ok) {
+        const int idx = row_lo_off + col_hi_off + chan_off;
+        val_lh = bottom_data[idx];
+        d_val_dh -= frac_w * val_lh;
+        d_val_dw += rem_h * val_lh;
+        atomicAdd(&grad_im[idx], wt_lh * scaled_grad);
+    }
+    if (row_hi_ok && col_lo_ok) {
+        const int idx = row_hi_off + col_lo_off + chan_off;
+        val_hl = bottom_data[idx];
+        d_val_dh += rem_w * val_hl;
+        d_val_dw -= frac_h * val_hl;
+        atomicAdd(&grad_im[idx], wt_hl * scaled_grad);
+    }
+    if (row_hi_ok && col_hi_ok) {
+        const int idx = row_hi_off + col_hi_off + chan_off;
+        val_hh = bottom_data[idx];
+        d_val_dh += frac_w * val_hh;
+        d_val_dw += frac_h * val_hh;
+        atomicAdd(&grad_im[idx], wt_hh * scaled_grad);
+    }
+
+    const opmath_t val =
+        (wt_ll * val_ll + wt_lh * val_lh + wt_hl * val_hl + wt_hh * val_hh);
+    grad_mask[0] = top_grad * val;
+    grad_offset[0] = offset_scale * d_val_dw * scaled_grad;
+    grad_offset[1] = offset_scale * d_val_dh * scaled_grad;
+}
+
+// Backward step for a single bilinear sample taken at (h, w); same arithmetic
+// as the non-"_gm" variant of this routine, except that the offset and mask
+// gradients are accumulated into their destinations with atomicAdd instead of
+// being stored.
+//
+// bottom_data and grad_im are addressed as
+//   (row * width + col) * (nheads * group_channels) + m * group_channels + c.
+// A corner takes part only if its low index is >= 0 (low row/column) or its
+// high index is <= size - 1 (high row/column); the opposite bound is not
+// checked here. Skipped corners contribute a value of 0.
+//
+// Effects (all via atomicAdd):
+//   grad_im        += bilinear weight * top_grad * mask   (per corner)
+//   grad_mask[0]   += top_grad * interpolated value
+//   grad_offset[0] += offset_scale * d(value)/dw * top_grad * mask
+//   grad_offset[1] += offset_scale * d(value)/dh * top_grad * mask
+template <typename scalar_t>
+__device__ void dcnv3_col2im_bilinear_gm(
+    const scalar_t *&bottom_data, const int &height, const int &width,
+    const int &nheads, const int &group_channels, const opmath_t &h,
+    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
+    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
+    opmath_t *grad_offset, opmath_t *grad_mask) {
+    // Lattice rows/columns bracketing the sample point.
+    const int row_lo = floor(h);
+    const int col_lo = floor(w);
+    const int row_hi = row_lo + 1;
+    const int col_hi = col_lo + 1;
+
+    // Fractional position inside the cell and its complement.
+    const opmath_t frac_h = h - row_lo;
+    const opmath_t frac_w = w - col_lo;
+    const opmath_t rem_h = 1 - frac_h;
+    const opmath_t rem_w = 1 - frac_w;
+
+    // Flat-index pieces for the four corners.
+    const int col_stride = nheads * group_channels;
+    const int row_stride = width * col_stride;
+    const int row_lo_off = row_lo * row_stride;
+    const int row_hi_off = row_lo_off + row_stride;
+    const int col_lo_off = col_lo * col_stride;
+    const int col_hi_off = col_lo_off + col_stride;
+    const int chan_off = m * group_channels + c;
+
+    // Bilinear weights: first suffix letter is the row (l/h), second the column.
+    const opmath_t wt_ll = rem_h * rem_w;
+    const opmath_t wt_lh = rem_h * frac_w;
+    const opmath_t wt_hl = frac_h * rem_w;
+    const opmath_t wt_hh = frac_h * frac_w;
+    const opmath_t scaled_grad = top_grad * mask;
+
+    const bool row_lo_ok = row_lo >= 0;
+    const bool row_hi_ok = row_hi <= height - 1;
+    const bool col_lo_ok = col_lo >= 0;
+    const bool col_hi_ok = col_hi <= width - 1;
+
+    // Partial derivatives of the interpolated value w.r.t. h and w.
+    opmath_t d_val_dh = 0;
+    opmath_t d_val_dw = 0;
+    opmath_t val_ll = 0, val_lh = 0, val_hl = 0, val_hh = 0;
+
+    if (row_lo_ok && col_lo_ok) {
+        const int idx = row_lo_off + col_lo_off + chan_off;
+        val_ll = bottom_data[idx];
+        d_val_dh -= rem_w * val_ll;
+        d_val_dw -= rem_h * val_ll;
+        atomicAdd(&grad_im[idx], wt_ll * scaled_grad);
+    }
+    if (row_lo_ok && col_hi_ok) {
+        const int idx = row_lo_off + col_hi_off + chan_off;
+        val_lh = bottom_data[idx];
+        d_val_dh -= frac_w * val_lh;
+        d_val_dw += rem_h * val_lh;
+        atomicAdd(&grad_im[idx], wt_lh * scaled_grad);
+    }
+    if (row_hi_ok && col_lo_ok) {
+        const int idx = row_hi_off + col_lo_off + chan_off;
+        val_hl = bottom_data[idx];
+        d_val_dh += rem_w * val_hl;
+        d_val_dw -= frac_h * val_hl;
+        atomicAdd(&grad_im[idx], wt_hl * scaled_grad);
+    }
+    if (row_hi_ok && col_hi_ok) {
+        const int idx = row_hi_off + col_hi_off + chan_off;
+        val_hh = bottom_data[idx];
+        d_val_dh += frac_w * val_hh;
+        d_val_dw += frac_h * val_hh;
+        atomicAdd(&grad_im[idx], wt_hh * scaled_grad);
+    }
+
+    const opmath_t val =
+        (wt_ll * val_ll + wt_lh * val_lh + wt_hl * val_hl + wt_hh * val_hh);
+    atomicAdd(&grad_mask[0], top_grad * val);
+    atomicAdd(&grad_offset[0], offset_scale * d_val_dw * scaled_grad);
+    atomicAdd(&grad_offset[1], offset_scale * d_val_dh * scaled_grad);
+}
+
+// Forward sampling kernel: every index in [0, num_kernels) produces one
+// element of data_col as the mask-weighted sum of kernel_w * kernel_h
+// bilinear samples of data_im.
+//
+// The flat index decodes, fastest component first, into
+//   channel-in-group, group, output column, output row, batch.
+// For tap t of a (batch, row, column, group) tuple with flat id s:
+//   data_mask[s * K + t]             is the tap weight,
+//   data_offset[2 * (s * K + t)]     is the w offset,
+//   data_offset[2 * (s * K + t) + 1] is the h offset,
+// where K = kernel_h * kernel_w and t = i * kernel_h + j (i walks kernel_w).
+// Taps whose location is not strictly inside (-1, size) on both axes are
+// skipped.
+template <typename scalar_t>
+__global__ void dcnv3_im2col_gpu_kernel(
+    const int num_kernels, const scalar_t *data_im, const scalar_t *data_offset,
+    const scalar_t *data_mask, scalar_t *data_col, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        // Decode the flat index without a mutable scratch variable.
+        const int flat = index;
+        const int c_col = flat % group_channels;
+        const int sampling_index = flat / group_channels;
+        const int g_col = sampling_index % group;
+        const int pixel_id = sampling_index / group;
+        const int out_x = pixel_id % width_out;
+        const int row_id = pixel_id / width_out;
+        const int out_y = row_id % height_out;
+        const int b_col = row_id / height_out;
+
+        // Half of the dilated kernel extent along each axis.
+        const int half_span_w = (dilation_w * (kernel_w - 1)) >> 1;
+        const int half_span_h = (dilation_h * (kernel_h - 1)) >> 1;
+        const int p0_w = half_span_w - pad_w + out_x * stride_w;
+        const int p0_h = half_span_h - pad_h + out_y * stride_h;
+
+        // Location of tap (0, 0) before its learned offset is added (top-left).
+        const opmath_t p0_w_ = p0_w - half_span_w * offset_scale;
+        const opmath_t p0_h_ = p0_h - half_span_h * offset_scale;
+
+        const int taps = kernel_h * kernel_w;
+        const int mask_base = sampling_index * taps;
+        const int offset_base = mask_base << 1;
+
+        // Start of this batch element inside data_im.
+        const int pixels_in = height_in * width_in;
+        const int channels = group * group_channels;
+        const scalar_t *data_im_ptr = data_im + b_col * pixels_in * channels;
+
+        opmath_t acc = 0;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const int tap = i * kernel_h + j;
+                const int offset_idx = offset_base + (tap << 1);
+                const opmath_t offset_w = data_offset[offset_idx];
+                const opmath_t offset_h = data_offset[offset_idx + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[mask_base + tap];
+                const bool inside = loc_h > -1 && loc_w > -1 &&
+                                    loc_h < height_in && loc_w < width_in;
+                if (inside) {
+                    acc += dcnv3_im2col_bilinear(
+                               data_im_ptr, height_in, width_in, group,
+                               group_channels, loc_h, loc_w, g_col, c_col) *
+                           weight;
+                }
+            }
+        }
+        data_col[index] = acc;
+    }
+}
+
+// debug
+// Gradient kernel: for every element of grad_col it scatters the input
+// gradient into grad_im (through dcnv3_col2im_bilinear) and produces the
+// gradients of the sampling offsets (grad_offset) and masks (grad_mask).
+//
+// Reduction scheme: each thread stages its per-tap offset/mask gradient in
+// statically sized __shared__ arrays (blockSize slots). After a barrier,
+// thread 0 sums all blockSize slots serially and stores the totals through its
+// own grad_offset / grad_mask pointers (plain stores, not accumulation).
+//
+// NOTE(review): only thread 0's destination is written for the whole block, so
+// this presumably relies on every thread of a block sharing the same
+// sampling_index (e.g. blockDim.x == blockSize == group_channels) -- confirm
+// against the launcher.
+// NOTE(review): __syncthreads() is issued inside CUDA_KERNEL_LOOP (defined
+// outside this chunk); verify that all threads of a block always execute the
+// loop body the same number of times.
+// NOTE(review): grad_offset / grad_mask are advanced in place inside the loop
+// body and never rewound, so a thread executing the body a second time would
+// start from an already shifted pointer -- confirm one iteration per thread.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        // Per-thread staging slots: two values (w, h) per thread for the
+        // offset gradient, one value per thread for the mask gradient.
+        __shared__ opmath_t cache_grad_offset[blockSize * 2];
+        __shared__ opmath_t cache_grad_mask[blockSize];
+        unsigned int tid = threadIdx.x;
+        // Decode the flat index, fastest component first: channel-in-group,
+        // group, output column, output row, batch.
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        // Flat id of the (batch, row, column, group) tuple.
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        // One mask value and one (w, h) offset pair per tap, hence the << 1.
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        // Move the output pointers to the first tap of this sampling_index.
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        // Location of tap (0, 0) before its learned offset is added.
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                // Clear this thread's slots so a skipped tap contributes 0.
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+
+                // Every slot must be written before thread 0 reads them.
+                __syncthreads();
+                if (tid == 0) {
+                    opmath_t _grad_w = cache_grad_offset[0],
+                             _grad_h = cache_grad_offset[1],
+                             _grad_a = cache_grad_mask[0];
+                    int sid = 2;
+                    // Serial sum over the remaining slots; this inner `tid`
+                    // shadows the outer thread-id variable.
+                    for (unsigned int tid = 1; tid < blockSize; ++tid) {
+                        _grad_w += cache_grad_offset[sid];
+                        _grad_h += cache_grad_offset[sid + 1];
+                        _grad_a += cache_grad_mask[tid];
+                        sid += 2;
+                    }
+
+                    *grad_offset = _grad_w;
+                    *(grad_offset + 1) = _grad_h;
+                    *grad_mask = _grad_a;
+                }
+                // Keep the slots intact until thread 0 has finished reading;
+                // the next tap overwrites them.
+                __syncthreads();
+
+                // Advance to the next tap.
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+
+// Gradient kernel: for every element of grad_col it scatters the input
+// gradient into grad_im (through dcnv3_col2im_bilinear) and produces the
+// gradients of the sampling offsets (grad_offset) and masks (grad_mask).
+//
+// Reduction scheme: each thread stages its per-tap offset/mask gradient in
+// statically sized __shared__ arrays (blockSize slots); the slots are then
+// folded pairwise (stride blockSize / 2, halved every round) into slot 0,
+// which thread 0 stores through its own grad_offset / grad_mask pointers.
+//
+// NOTE(review): the halving loop never folds in some slots when blockSize is
+// not a power of two (e.g. with blockSize = 6, slot 2 is never added) --
+// confirm callers only instantiate power-of-two block sizes.
+// NOTE(review): only thread 0's destination is written for the whole block, so
+// this presumably relies on every thread of a block sharing the same
+// sampling_index -- confirm against the launcher.
+// NOTE(review): __syncthreads() is issued inside CUDA_KERNEL_LOOP (defined
+// outside this chunk); verify that all threads of a block always execute the
+// loop body the same number of times.
+// NOTE(review): grad_offset / grad_mask are advanced in place inside the loop
+// body and never rewound, so a thread executing the body a second time would
+// start from an already shifted pointer -- confirm one iteration per thread.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        // Per-thread staging slots: two values (w, h) per thread for the
+        // offset gradient, one value per thread for the mask gradient.
+        __shared__ opmath_t cache_grad_offset[blockSize * 2];
+        __shared__ opmath_t cache_grad_mask[blockSize];
+        unsigned int tid = threadIdx.x;
+        // Decode the flat index, fastest component first: channel-in-group,
+        // group, output column, output row, batch.
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        // Flat id of the (batch, row, column, group) tuple.
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        // One mask value and one (w, h) offset pair per tap, hence the << 1.
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        // Move the output pointers to the first tap of this sampling_index.
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        // Location of tap (0, 0) before its learned offset is added.
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                // Clear this thread's slots so a skipped tap contributes 0.
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+
+                // Every slot must be written before the reduction reads them.
+                __syncthreads();
+
+                // Pairwise fold: in each round the lower s threads add the
+                // slot that sits s entries above their own.
+                for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
+                    if (tid < s) {
+                        // Offset slots hold (w, h) pairs, so indices double.
+                        const unsigned int xid1 = tid << 1;
+                        const unsigned int xid2 = (tid + s) << 1;
+                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
+                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
+                        cache_grad_offset[xid1 + 1] +=
+                            cache_grad_offset[xid2 + 1];
+                    }
+                    // Finish the round before the next one reads the sums.
+                    __syncthreads();
+                }
+
+                // Slot 0 now holds the folded total; store it (no accumulation).
+                if (tid == 0) {
+                    *grad_offset = cache_grad_offset[0];
+                    *(grad_offset + 1) = cache_grad_offset[1];
+                    *grad_mask = cache_grad_mask[0];
+                }
+                // Keep slot 0 intact until thread 0 has stored it; the next
+                // tap overwrites the slots.
+                __syncthreads();
+
+                // Advance to the next tap.
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+
+// Gradient kernel: for every element of grad_col it scatters the input
+// gradient into grad_im (through dcnv3_col2im_bilinear) and produces the
+// gradients of the sampling offsets (grad_offset) and masks (grad_mask).
+//
+// Reduction scheme: each thread stages its per-tap offset/mask gradient in
+// dynamically sized shared memory (`extern __shared__`). After a barrier,
+// thread 0 sums all blockDim.x slots serially and stores the totals through
+// its own grad_offset / grad_mask pointers (plain stores, not accumulation).
+//
+// Shared memory requirement implied by the indexing below: room for
+// 3 * blockDim.x opmath_t values (2 * blockDim.x offsets followed by
+// blockDim.x masks).
+//
+// NOTE(review): only thread 0's destination is written for the whole block, so
+// this presumably relies on every thread of a block sharing the same
+// sampling_index (e.g. blockDim.x == group_channels) -- confirm against the
+// launcher.
+// NOTE(review): __syncthreads() is issued inside CUDA_KERNEL_LOOP (defined
+// outside this chunk); verify that all threads of a block always execute the
+// loop body the same number of times.
+// NOTE(review): grad_offset / grad_mask are advanced in place inside the loop
+// body and never rewound, so a thread executing the body a second time would
+// start from an already shifted pointer -- confirm one iteration per thread.
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v1(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        // Carve the dynamic shared buffer into the offset slots (two values
+        // per thread) followed by the mask slots (one value per thread).
+        extern __shared__ int _s[];
+        opmath_t *cache_grad_offset = (opmath_t *)_s;
+        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
+        unsigned int tid = threadIdx.x;
+        // Decode the flat index, fastest component first: channel-in-group,
+        // group, output column, output row, batch.
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        // Flat id of the (batch, row, column, group) tuple.
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        // One mask value and one (w, h) offset pair per tap, hence the << 1.
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        // Move the output pointers to the first tap of this sampling_index.
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        // Location of tap (0, 0) before its learned offset is added.
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                // Clear this thread's slots so a skipped tap contributes 0.
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+
+                // Every slot must be written before thread 0 reads them.
+                __syncthreads();
+                if (tid == 0) {
+                    opmath_t _grad_w = cache_grad_offset[0],
+                             _grad_h = cache_grad_offset[1],
+                             _grad_a = cache_grad_mask[0];
+                    int sid = 2;
+                    // Serial sum over the remaining slots; this inner `tid`
+                    // shadows the outer thread-id variable.
+                    for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
+                        _grad_w += cache_grad_offset[sid];
+                        _grad_h += cache_grad_offset[sid + 1];
+                        _grad_a += cache_grad_mask[tid];
+                        sid += 2;
+                    }
+
+                    *grad_offset = _grad_w;
+                    *(grad_offset + 1) = _grad_h;
+                    *grad_mask = _grad_a;
+                }
+                // Keep the slots intact until thread 0 has finished reading;
+                // the next tap overwrites them.
+                __syncthreads();
+
+                // Advance to the next tap.
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+
+// Gradient kernel: for every element of grad_col it scatters the input
+// gradient into grad_im (through dcnv3_col2im_bilinear) and produces the
+// gradients of the sampling offsets (grad_offset) and masks (grad_mask).
+//
+// Reduction scheme: each thread stages its per-tap offset/mask gradient in
+// dynamically sized shared memory (`extern __shared__`); the blockDim.x slots
+// are folded pairwise into slot 0, with an extra fold per round that picks up
+// the trailing slot when the current partition size is odd. Thread 0 then
+// stores slot 0 through its own grad_offset / grad_mask pointers.
+//
+// Shared memory requirement implied by the indexing below: room for
+// 3 * blockDim.x opmath_t values (2 * blockDim.x offsets followed by
+// blockDim.x masks).
+//
+// NOTE(review): only thread 0's destination is written for the whole block, so
+// this presumably relies on every thread of a block sharing the same
+// sampling_index -- confirm against the launcher.
+// NOTE(review): __syncthreads() is issued inside CUDA_KERNEL_LOOP (defined
+// outside this chunk); verify that all threads of a block always execute the
+// loop body the same number of times.
+// NOTE(review): grad_offset / grad_mask are advanced in place inside the loop
+// body and never rewound, so a thread executing the body a second time would
+// start from an already shifted pointer -- confirm one iteration per thread.
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        // Carve the dynamic shared buffer into the offset slots (two values
+        // per thread) followed by the mask slots (one value per thread).
+        extern __shared__ int _s[];
+        opmath_t *cache_grad_offset = (opmath_t *)_s;
+        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
+        unsigned int tid = threadIdx.x;
+        // Decode the flat index, fastest component first: channel-in-group,
+        // group, output column, output row, batch.
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        // Flat id of the (batch, row, column, group) tuple.
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        // One mask value and one (w, h) offset pair per tap, hence the << 1.
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        // Move the output pointers to the first tap of this sampling_index.
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        // Location of tap (0, 0) before its learned offset is added.
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                // Clear this thread's slots so a skipped tap contributes 0.
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+
+                // Every slot must be written before the reduction reads them.
+                __syncthreads();
+
+                // Pairwise fold. `spre` is the number of live slots entering
+                // the round and `s` is half of it (rounded down).
+                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+                     s >>= 1, spre >>= 1) {
+                    if (tid < s) {
+                        // Offset slots hold (w, h) pairs, so indices double.
+                        const unsigned int xid1 = tid << 1;
+                        const unsigned int xid2 = (tid + s) << 1;
+                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
+                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
+                        cache_grad_offset[xid1 + 1] +=
+                            cache_grad_offset[xid2 + 1];
+                        // When spre is odd, slot 2 * s is left over; this
+                        // condition holds only for tid == 0, which folds it in.
+                        if (tid + (s << 1) < spre) {
+                            cache_grad_mask[tid] +=
+                                cache_grad_mask[tid + (s << 1)];
+                            cache_grad_offset[xid1] +=
+                                cache_grad_offset[xid2 + (s << 1)];
+                            cache_grad_offset[xid1 + 1] +=
+                                cache_grad_offset[xid2 + 1 + (s << 1)];
+                        }
+                    }
+                    // Finish the round before the next one reads the sums.
+                    __syncthreads();
+                }
+
+                // Slot 0 now holds the folded total; store it (no accumulation).
+                if (tid == 0) {
+                    *grad_offset = cache_grad_offset[0];
+                    *(grad_offset + 1) = cache_grad_offset[1];
+                    *grad_mask = cache_grad_mask[0];
+                }
+                // Keep slot 0 intact until thread 0 has stored it; the next
+                // tap overwrites the slots.
+                __syncthreads();
+
+                // Advance to the next tap.
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+
+// Gradient kernel: for every element of grad_col it scatters the input
+// gradient into grad_im (through dcnv3_col2im_bilinear) and produces the
+// gradients of the sampling offsets (grad_offset) and masks (grad_mask).
+//
+// Reduction scheme: each thread stages its per-tap offset/mask gradient in
+// dynamically sized shared memory (`extern __shared__`); the blockDim.x slots
+// are folded pairwise into slot 0, with an extra fold per round that picks up
+// the trailing slot when the current partition size is odd. Thread 0 then
+// adds slot 0 to grad_offset / grad_mask with atomicAdd (accumulation, not a
+// plain store) -- presumably because several blocks may contribute to the
+// same destination, as the "_multi_blocks" name suggests; confirm against the
+// launcher. The destinations are only ever added to here, so they are
+// presumably zero-initialised by the caller -- TODO confirm.
+//
+// Shared memory requirement implied by the indexing below: room for
+// 3 * blockDim.x opmath_t values (2 * blockDim.x offsets followed by
+// blockDim.x masks).
+//
+// NOTE(review): only thread 0's destination is written for the whole block, so
+// this presumably relies on every thread of a block sharing the same
+// sampling_index -- confirm against the launcher.
+// NOTE(review): __syncthreads() is issued inside CUDA_KERNEL_LOOP (defined
+// outside this chunk); verify that all threads of a block always execute the
+// loop body the same number of times.
+// NOTE(review): grad_offset / grad_mask are advanced in place inside the loop
+// body and never rewound, so a thread executing the body a second time would
+// start from an already shifted pointer -- confirm one iteration per thread.
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        // Carve the dynamic shared buffer into the offset slots (two values
+        // per thread) followed by the mask slots (one value per thread).
+        extern __shared__ int _s[];
+        opmath_t *cache_grad_offset = (opmath_t *)_s;
+        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
+        unsigned int tid = threadIdx.x;
+        // Decode the flat index, fastest component first: channel-in-group,
+        // group, output column, output row, batch.
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        // Flat id of the (batch, row, column, group) tuple.
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        // One mask value and one (w, h) offset pair per tap, hence the << 1.
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        // Move the output pointers to the first tap of this sampling_index.
+        grad_offset += grad_sampling_ptr << 1;
+        grad_mask += grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        // Location of tap (0, 0) before its learned offset is added.
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                // Clear this thread's slots so a skipped tap contributes 0.
+                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
+                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
+                *(cache_grad_mask + threadIdx.x) = 0;
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr,
+                        cache_grad_offset + (threadIdx.x << 1),
+                        cache_grad_mask + threadIdx.x);
+                }
+
+                // Every slot must be written before the reduction reads them.
+                __syncthreads();
+
+                // Pairwise fold. `spre` is the number of live slots entering
+                // the round and `s` is half of it (rounded down).
+                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+                     s >>= 1, spre >>= 1) {
+                    if (tid < s) {
+                        // Offset slots hold (w, h) pairs, so indices double.
+                        const unsigned int xid1 = tid << 1;
+                        const unsigned int xid2 = (tid + s) << 1;
+                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
+                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
+                        cache_grad_offset[xid1 + 1] +=
+                            cache_grad_offset[xid2 + 1];
+                        // When spre is odd, slot 2 * s is left over; this
+                        // condition holds only for tid == 0, which folds it in.
+                        if (tid + (s << 1) < spre) {
+                            cache_grad_mask[tid] +=
+                                cache_grad_mask[tid + (s << 1)];
+                            cache_grad_offset[xid1] +=
+                                cache_grad_offset[xid2 + (s << 1)];
+                            cache_grad_offset[xid1 + 1] +=
+                                cache_grad_offset[xid2 + 1 + (s << 1)];
+                        }
+                    }
+                    // Finish the round before the next one reads the sums.
+                    __syncthreads();
+                }
+
+                // Slot 0 now holds the block's total; accumulate it atomically
+                // into the destination.
+                if (tid == 0) {
+                    atomicAdd(grad_offset, cache_grad_offset[0]);
+                    atomicAdd(grad_offset + 1, cache_grad_offset[1]);
+                    atomicAdd(grad_mask, cache_grad_mask[0]);
+                }
+                // Keep slot 0 intact until thread 0 has consumed it; the next
+                // tap overwrites the slots.
+                __syncthreads();
+
+                // Advance to the next tap.
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask += 1;
+                grad_offset += 2;
+            }
+        }
+    }
+}
+
+// Backward (col2im) kernel that hands every gradient contribution straight to
+// dcnv3_col2im_bilinear_gm, without the block-level reduction used by the
+// other col2im kernels in this file.
+//
+// Each value of `index` addresses one element of `grad_col`, decomposed (from
+// fastest to slowest varying) into channel-in-group, group, output column,
+// output row and batch. For that element the kernel walks all
+// kernel_w * kernel_h sampling points and, for every point whose sampling
+// location lies strictly inside (-1, height_in) x (-1, width_in), forwards
+// the gradient to dcnv3_col2im_bilinear_gm together with the matching slots
+// of grad_im / grad_offset / grad_mask.
+//
+// NOTE(review): dcnv3_col2im_bilinear_gm is defined elsewhere; it presumably
+// accumulates with atomics because several channels share the same
+// offset/mask slot - confirm against its definition.
+template <typename scalar_t>
+__global__ void dcnv3_col2im_gpu_kernel_gm(
+    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int height_in,
+    const int width_in, const int height_out, const int width_out,
+    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
+    opmath_t *grad_mask) {
+    CUDA_KERNEL_LOOP(index, num_kernels) {
+        // Unpack the flat index: channel, group, output x, output y, batch.
+        int _temp = index;
+        const int c_col = _temp % group_channels;
+        _temp /= group_channels;
+        // Flat (batch, out_y, out_x, group) index shared by all channels of
+        // the group; it selects the offset/mask entries for this element.
+        const int sampling_index = _temp;
+        const int g_col = _temp % group;
+        _temp /= group;
+        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
+                         (_temp % width_out) * stride_w;
+        _temp /= width_out;
+        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
+                         (_temp % height_out) * stride_h;
+        _temp /= height_out;
+        const int b_col = _temp;
+
+        const opmath_t top_grad = grad_col[index];
+        const int input_size = height_in * width_in;
+        const int kernel_size = kernel_h * kernel_w;
+        // data_mask holds one value per sampling point, data_offset two
+        // (w at the even slot, h at the odd slot).
+        int data_weight_ptr = sampling_index * kernel_size;
+        int data_loc_w_ptr = data_weight_ptr << 1;
+        const int grad_sampling_ptr = data_weight_ptr;
+        // Use per-iteration cursors instead of advancing the kernel
+        // parameters: if CUDA_KERNEL_LOOP (defined elsewhere, presumably a
+        // grid-stride loop) runs more than once for a thread, the base
+        // pointers must still refer to the start of the gradient buffers.
+        opmath_t *grad_offset_ptr = grad_offset + (grad_sampling_ptr << 1);
+        opmath_t *grad_mask_ptr = grad_mask + grad_sampling_ptr;
+        const int qid_stride = group * group_channels;
+        const int im_ptr_offset = b_col * input_size * qid_stride;
+        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
+        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
+        // Scale the kernel's half extent by offset_scale so that the scaled
+        // sampling grid stays centred on (p0_h, p0_w).
+        const opmath_t p0_w_ =
+            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
+        const opmath_t p0_h_ =
+            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
+        // Sampling points are visited with the width index outermost, which
+        // fixes the point order expected in data_offset / data_mask.
+        for (int i = 0; i < kernel_w; ++i) {
+            for (int j = 0; j < kernel_h; ++j) {
+                const opmath_t offset_w = data_offset[data_loc_w_ptr];
+                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
+                const opmath_t loc_w =
+                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
+                const opmath_t loc_h =
+                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
+                const opmath_t weight = data_mask[data_weight_ptr];
+                // Points outside the open interval (-1, size) are skipped and
+                // therefore contribute no gradient.
+                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
+                    loc_w < width_in) {
+                    dcnv3_col2im_bilinear_gm(
+                        data_im_ptr, height_in, width_in, group, group_channels,
+                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
+                        weight, grad_im_ptr, grad_offset_ptr, grad_mask_ptr);
+                }
+                // Advance every cursor to the next sampling point.
+                data_weight_ptr += 1;
+                data_loc_w_ptr += 2;
+                grad_mask_ptr += 1;
+                grad_offset_ptr += 2;
+            }
+        }
+    }
+}
+
+// Host-side launcher for the DCNv3 forward (im2col) kernel.
+//
+// Enqueues dcnv3_im2col_gpu_kernel on `stream` with one work item per element
+// of the output, i.e. batch_n * height_out * width_out * group *
+// group_channels items, using CUDA_NUM_THREADS threads per block. A launch
+// error is only reported on stdout; it is not propagated to the caller.
+template <typename scalar_t>
+void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im,
+                       const scalar_t *data_offset, const scalar_t *data_mask,
+                       scalar_t *data_col, const int kernel_h,
+                       const int kernel_w, const int stride_h,
+                       const int stride_w, const int pad_h, const int pad_w,
+                       const int dilation_h, const int dilation_w,
+                       const int group, const int group_channels,
+                       const int batch_n, const int height_in,
+                       const int width_in, const int height_out,
+                       const int width_out, const opmath_t offset_scale) {
+    // The element count drives both the grid size and the kernel's loop bound.
+    const int total_elements =
+        batch_n * height_out * width_out * group * group_channels;
+    const int threads_per_block = CUDA_NUM_THREADS;
+
+    dcnv3_im2col_gpu_kernel<scalar_t>
+        <<<GET_BLOCKS(total_elements, threads_per_block), threads_per_block,
+           0, stream>>>(
+            total_elements, data_im, data_offset, data_mask, data_col,
+            kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
+            dilation_w, group, group_channels, height_in, width_in,
+            height_out, width_out, offset_scale);
+
+    // Surface launch-configuration errors right away.
+    const cudaError_t launch_status = cudaGetLastError();
+    if (launch_status != cudaSuccess) {
+        printf("error in dcnv3_im2col_cuda: %s\n",
+               cudaGetErrorString(launch_status));
+    }
+}
+
+// Host-side launcher for the DCNv3 backward (col2im) pass.
+//
+// Picks one of several backward kernels based on `group_channels` and
+// enqueues it on `stream`. Every variant receives the same argument list and
+// accumulates into grad_im / grad_offset / grad_mask.
+//
+// Launch configuration:
+//   * block size = min(group_channels, CUDA_NUM_THREADS)
+//   * work items = batch_n * height_out * width_out * group * group_channels
+//
+// Kernel selection:
+//   * group_channels > 1024 and a multiple of 1024 -> multi-block shared
+//     memory reduction (dynamic shared memory)
+//   * group_channels > 1024 otherwise              -> global-memory kernel
+//   * power of two in [1, 32]      -> blocksize-aware reduce v1<N>
+//   * power of two in [64, 1024]   -> blocksize-aware reduce v2<N>
+//   * any other value < 64         -> shm_reduce_v1 (dynamic shared memory)
+//   * any other value in (64,1024) -> shm_reduce_v2 (dynamic shared memory)
+//
+// A launch error is only printed to stdout; it is not propagated.
+template <typename scalar_t>
+void dcnv3_col2im_cuda(
+    cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im,
+    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int group_channels, const int batch_n,
+    const int height_in, const int width_in, const int height_out,
+    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
+    opmath_t *grad_offset, opmath_t *grad_mask) {
+    // Block size is capped at CUDA_NUM_THREADS; below the cap it equals the
+    // number of channels per group.
+    const int num_threads =
+        (group_channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : group_channels;
+    // num_kernels is passed to the kernel as its loop bound,
+    // num_actual_kernels sizes the grid; both hold the same value here.
+    const int num_kernels =
+        batch_n * height_out * width_out * group * group_channels;
+    const int num_actual_kernels =
+        batch_n * height_out * width_out * group * group_channels;
+    if (group_channels > 1024) {
+        // More channels per group than the largest templated block size.
+        if ((group_channels & 1023) == 0) {
+            // Exact multiple of 1024. The dynamic shared memory holds three
+            // opmath_t values per thread (presumably two offset-gradient
+            // components plus one mask gradient - confirm in the kernel).
+            dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+                   num_threads * 3 * sizeof(opmath_t), stream>>>(
+                    num_kernels, grad_col, data_im, data_offset, data_mask,
+                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                    dilation_h, dilation_w, group, group_channels, height_in,
+                    width_in, height_out, width_out, offset_scale, grad_im,
+                    grad_offset, grad_mask);
+        } else {
+            // Irregular channel count: fall back to the kernel that needs no
+            // shared memory.
+            dcnv3_col2im_gpu_kernel_gm<scalar_t>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+        }
+    } else {
+        // Power-of-two channel counts get a kernel specialised on the block
+        // size as a compile-time template argument (v1 up to 32, v2 from 64);
+        // these are launched without dynamic shared memory.
+        switch (group_channels) {
+        case 1:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 2:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 4:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 8:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 16:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 32:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 64:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 128:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 256:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 512:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        case 1024:
+            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
+                                                                  1024>
+                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+                   stream>>>(num_kernels, grad_col, data_im, data_offset,
+                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
+                             pad_h, pad_w, dilation_h, dilation_w, group,
+                             group_channels, height_in, width_in, height_out,
+                             width_out, offset_scale, grad_im, grad_offset,
+                             grad_mask);
+            break;
+        default:
+            // Channel counts that are not a power of two use the generic
+            // reductions, which size their scratch space at launch time
+            // (three opmath_t values per thread).
+            if (group_channels < 64) {
+                dcnv3_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+                    <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+                       num_threads * 3 * sizeof(opmath_t), stream>>>(
+                        num_kernels, grad_col, data_im, data_offset, data_mask,
+                        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                        dilation_h, dilation_w, group, group_channels,
+                        height_in, width_in, height_out, width_out,
+                        offset_scale, grad_im, grad_offset, grad_mask);
+            } else {
+                dcnv3_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+                    <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+                       num_threads * 3 * sizeof(opmath_t), stream>>>(
+                        num_kernels, grad_col, data_im, data_offset, data_mask,
+                        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+                        dilation_h, dilation_w, group, group_channels,
+                        height_in, width_in, height_out, width_out,
+                        offset_scale, grad_im, grad_offset, grad_mask);
+            }
+        }
+    }
+    // Report launch errors from whichever kernel was selected above.
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
+    }
+}
\ No newline at end of file
--- a/segmentation/ops_dcnv3/src/dcnv3.h
+++ b/segmentation/ops_dcnv3/src/dcnv3.h
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+
+#include "cpu/dcnv3_cpu.h"
+
+#ifdef WITH_CUDA
+#include "cuda/dcnv3_cuda.h"
+#endif
+
+/*!
+ * Dispatch the DCNv3 forward pass to the backend matching `input`'s device.
+ *
+ * All arguments are forwarded unchanged to dcnv3_cuda_forward when `input`
+ * lives on a CUDA device and the extension was built with WITH_CUDA.
+ *
+ * Raises (via AT_ERROR):
+ *   - "Not compiled with GPU support" for a CUDA tensor in a CPU-only build;
+ *   - "Not implemented on the CPU" for any non-CUDA tensor.
+ */
+at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
+                         const at::Tensor &mask, const int kernel_h,
+                         const int kernel_w, const int stride_h,
+                         const int stride_w, const int pad_h, const int pad_w,
+                         const int dilation_h, const int dilation_w,
+                         const int group, const int group_channels,
+                         const float offset_scale, const int im2col_step) {
+    // Query the device on the tensor itself; Tensor::type() is deprecated.
+    if (input.is_cuda()) {
+#ifdef WITH_CUDA
+        return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
+                                  stride_h, stride_w, pad_h, pad_w, dilation_h,
+                                  dilation_w, group, group_channels,
+                                  offset_scale, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+
+/*!
+ * Dispatch the DCNv3 backward pass to the backend matching `input`'s device.
+ *
+ * All arguments are forwarded unchanged to dcnv3_cuda_backward when `input`
+ * lives on a CUDA device and the extension was built with WITH_CUDA; its
+ * vector of gradient tensors is returned as-is.
+ *
+ * Raises (via AT_ERROR):
+ *   - "Not compiled with GPU support" for a CUDA tensor in a CPU-only build;
+ *   - "Not implemented on the CPU" for any non-CUDA tensor.
+ */
+std::vector<at::Tensor>
+dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
+               const at::Tensor &mask, const int kernel_h, const int kernel_w,
+               const int stride_h, const int stride_w, const int pad_h,
+               const int pad_w, const int dilation_h, const int dilation_w,
+               const int group, const int group_channels,
+               const float offset_scale, const at::Tensor &grad_output,
+               const int im2col_step) {
+    // Query the device on the tensor itself; Tensor::type() is deprecated.
+    if (input.is_cuda()) {
+#ifdef WITH_CUDA
+        return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
+                                   stride_h, stride_w, pad_h, pad_w, dilation_h,
+                                   dilation_w, group, group_channels,
+                                   offset_scale, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
--- a/segmentation/ops_dcnv3/src/vision.cpp
+++ b/segmentation/ops_dcnv3/src/vision.cpp
+/*!
+**************************************************************************************************
+* InternImage
+* Copyright (c) 2022 OpenGVLab
+* Licensed under The MIT License [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include "dcnv3.h"
+
+// Python bindings: expose the two dispatch functions from dcnv3.h under the
+// module name supplied by the build through TORCH_EXTENSION_NAME.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    // Each m.def takes (python name, C++ function, docstring).
+    m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward");
+    m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward");
+}
--- a/segmentation/ops_dcnv3/test.py
+++ b/segmentation/ops_dcnv3/test.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import time
+import torch
+import torch.nn as nn
+import math
+from torch.autograd import gradcheck
+
+from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch
+
+# Shared problem size for the checks below.
+# Spatial size of the input feature map.
+H_in, W_in = 8, 8
+# N: batch size; inputs are built with M*D channels, offsets with M*P*2 and
+# masks with M*P, so M and D presumably map to the op's `group` and
+# `group_channels` arguments - confirm against DCNv3Function.
+N, M, D = 2, 4, 16
+# Kernel height / width and the resulting number of sampling points.
+Kh, Kw = 3, 3
+P = Kh * Kw
+offset_scale = 2.0
+pad = 1
+dilation = 1
+stride = 1
+# Output spatial size from the standard convolution size formula.
+H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+
+# Fixed seed so the random test tensors are reproducible.
+torch.manual_seed(3)
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+
+    output_pytorch = dcnv3_core_pytorch(
+        input.double(),
+        offset.double(),
+        mask.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu()
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input.double(),
+        offset.double(),
+        mask.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step).detach().cpu()
+
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() /
+                   output_pytorch.abs()).max()
+    print('>>> forward double')
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+
+    output_pytorch = dcnv3_core_pytorch(
+        input,
+        offset,
+        mask,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu()
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input,
+        offset,
+        mask,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step).detach().cpu()
+
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() /
+                   output_pytorch.abs()).max()
+    print('>>> forward float')
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
+    # H_in, W_in = 4, 4
+    N = 2
+    M = 2
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+
+    D = channels
+    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask0 /= mask0.sum(-1, keepdim=True)
+    mask0 = mask0.reshape(N, H_out, W_out, M*P)
+    input0.requires_grad = grad_input
+    offset0.requires_grad = grad_offset
+    mask0.requires_grad = grad_mask
+
+    output_pytorch = dcnv3_core_pytorch(
+        input0.double(),
+        offset0.double(),
+        mask0.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale)
+    output_pytorch.sum().backward()
+
+    input1 = input0.detach()
+    offset1 = offset0.detach()
+    mask1 = mask0.detach()
+    input1.requires_grad = grad_input
+    offset1.requires_grad = grad_offset
+    mask1.requires_grad = grad_mask
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input1.double(),
+        offset1.double(),
+        mask1.double(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step)
+    output_cuda.sum().backward()
+
+    print(f'>>> backward double: channels {D}')
+    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (input0.grad - input1.grad).abs().max()
+    max_rel_err = ((input0.grad - input1.grad).abs() /
+                   input0.grad.abs()).max()
+    print(
+        f'* {bwdok} input_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (offset0.grad - offset1.grad).abs().max()
+    max_rel_err = ((offset0.grad - offset1.grad).abs() /
+                   offset0.grad.abs()).max()
+    print(
+        f'* {bwdok} offset_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (mask0.grad - mask1.grad).abs().max()
+    max_rel_err = ((mask0.grad - mask1.grad).abs() /
+                   mask0.grad.abs()).max()
+    print(
+        f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+def check_backward_equal_with_pytorch_float(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
+    # H_in, W_in = 4, 4
+    N = 2
+    M = 2
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+
+    D = channels
+    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask0 /= mask0.sum(-1, keepdim=True)
+    mask0 = mask0.reshape(N, H_out, W_out, M*P)
+    input0.requires_grad = grad_input
+    offset0.requires_grad = grad_offset
+    mask0.requires_grad = grad_mask
+
+    output_pytorch = dcnv3_core_pytorch(
+        input0,
+        offset0,
+        mask0,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale)
+    output_pytorch.sum().backward()
+
+    input1 = input0.detach()
+    offset1 = offset0.detach()
+    mask1 = mask0.detach()
+    input1.requires_grad = grad_input
+    offset1.requires_grad = grad_offset
+    mask1.requires_grad = grad_mask
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input1,
+        offset1,
+        mask1,
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step)
+    output_cuda.sum().backward()
+
+    print(f'>>> backward float: channels {D}')
+    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (input0.grad - input1.grad).abs().max()
+    max_rel_err = ((input0.grad - input1.grad).abs() /
+                   input0.grad.abs()).max()
+    print(
+        f'* {bwdok} input_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (offset0.grad - offset1.grad).abs().max()
+    max_rel_err = ((offset0.grad - offset1.grad).abs() /
+                   offset0.grad.abs()).max()
+    print(
+        f'* {bwdok} offset_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (mask0.grad - mask1.grad).abs().max()
+    max_rel_err = ((mask0.grad - mask1.grad).abs() /
+                   mask0.grad.abs()).max()
+    print(
+        f'* {bwdok} mask_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+@torch.no_grad()
+def check_time_cost(im2col_step=128):
+    N = 512
+    H_in, W_in = 64, 64
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+    print(
+        f'>>> time cost: im2col_step {im2col_step}; input {input.shape}; points {P} ')
+    repeat = 100
+    for i in range(repeat):
+        output_cuda = DCNv3Function.apply(
+            input,
+            offset,
+            mask,
+            Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0,
+            im2col_step)
+    torch.cuda.synchronize()
+    start = time.time()
+    for i in range(repeat):
+        output_cuda = DCNv3Function.apply(
+            input,
+            offset,
+            mask,
+            Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0,
+            im2col_step)
+    torch.cuda.synchronize()
+    print(f'foward time cost: {(time.time() - start) / repeat}')
+
+
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+    for channels in [1, 16, 30, 32, 64, 71, 1025]:
+        check_backward_equal_with_pytorch_double(channels, True, True, True)
+    for channels in [1, 16, 30, 32, 64, 71, 1025]:
+        check_backward_equal_with_pytorch_float(channels, True, True, True)
+    for i in range(3):
+        im2col_step = 128 * (2 ** i)
+        check_time_cost(im2col_step)