# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch # noqa
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from __future__ import absolute_import, division, print_function
import pkg_resources
import DCNv3
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd
dcn_version = float(pkg_resources.get_distribution('DCNv3').version)
class DCNv3Function(Function):
@staticmethod
@custom_fwd
def forward(ctx, input, offset, mask, kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, im2col_step, remove_center):
ctx.kernel_h = kernel_h
ctx.kernel_w = kernel_w
ctx.stride_h = stride_h
ctx.stride_w = stride_w
ctx.pad_h = pad_h
ctx.pad_w = pad_w
ctx.dilation_h = dilation_h
ctx.dilation_w = dilation_w
ctx.group = group
ctx.group_channels = group_channels
ctx.offset_scale = offset_scale
ctx.im2col_step = im2col_step
ctx.remove_center = remove_center
args = [
input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h,
pad_w, dilation_h, dilation_w, group, group_channels, offset_scale,
ctx.im2col_step
]
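        # Append remove_center only when the compiled DCNv3 op supports it
        # (builds newer than 1.0) or when it is explicitly requested.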
if remove_center or dcn_version > 1.0:
args.append(remove_center)
output = DCNv3.dcnv3_forward(*args)
ctx.save_for_backward(input, offset, mask)
return output
@staticmethod
@once_differentiable
@custom_bwd
def backward(ctx, grad_output):
input, offset, mask = ctx.saved_tensors
args = [
input, offset, mask, ctx.kernel_h, ctx.kernel_w, ctx.stride_h,
ctx.stride_w, ctx.pad_h, ctx.pad_w, ctx.dilation_h, ctx.dilation_w,
ctx.group, ctx.group_channels, ctx.offset_scale,
grad_output.contiguous(), ctx.im2col_step
]
if ctx.remove_center or dcn_version > 1.0:
args.append(ctx.remove_center)
grad_input, grad_offset, grad_mask = \
DCNv3.dcnv3_backward(*args)
return grad_input, grad_offset, grad_mask, \
None, None, None, None, None, None, None,\
None, None, None, None, None, None
@staticmethod
def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, im2col_step, remove_center):
"""Symbolic function for mmdeploy::DCNv3.
Returns:
DCNv3 op for onnx.
"""
return g.op(
'mmdeploy::TRTDCNv3',
input,
offset,
mask,
kernel_h_i=int(kernel_h),
kernel_w_i=int(kernel_w),
stride_h_i=int(stride_h),
stride_w_i=int(stride_w),
pad_h_i=int(pad_h),
pad_w_i=int(pad_w),
dilation_h_i=int(dilation_h),
dilation_w_i=int(dilation_w),
group_i=int(group),
group_channels_i=int(group_channels),
offset_scale_f=float(offset_scale),
im2col_step_i=int(im2col_step),
remove_center_i=int(remove_center),
)
def _get_reference_points(spatial_shapes,
device,
kernel_h,
kernel_w,
dilation_h,
dilation_w,
pad_h=0,
pad_w=0,
stride_h=1,
stride_w=1):
_, H_, W_, _ = spatial_shapes
H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
ref_y, ref_x = torch.meshgrid(
torch.linspace(
# pad_h + 0.5,
# H_ - pad_h - 0.5,
(dilation_h * (kernel_h - 1)) // 2 + 0.5,
(dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h,
H_out,
dtype=torch.float32,
device=device),
torch.linspace(
# pad_w + 0.5,
# W_ - pad_w - 0.5,
(dilation_w * (kernel_w - 1)) // 2 + 0.5,
(dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w,
W_out,
dtype=torch.float32,
device=device))
ref_y = ref_y.reshape(-1)[None] / H_
ref_x = ref_x.reshape(-1)[None] / W_
ref = torch.stack((ref_x, ref_y), -1).reshape(1, H_out, W_out, 1, 2)
return ref
def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h,
dilation_w, group, device):
_, H_, W_, _ = spatial_shapes
points_list = []
x, y = torch.meshgrid(
torch.linspace(
-((dilation_w * (kernel_w - 1)) // 2),
-((dilation_w * (kernel_w - 1)) // 2) +
(kernel_w - 1) * dilation_w,
kernel_w,
dtype=torch.float32,
device=device),
torch.linspace(
-((dilation_h * (kernel_h - 1)) // 2),
-((dilation_h * (kernel_h - 1)) // 2) +
(kernel_h - 1) * dilation_h,
kernel_h,
dtype=torch.float32,
device=device))
points_list.extend([x / W_, y / H_])
grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\
repeat(1, group, 1).permute(1, 0, 2)
grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2)
return grid
def remove_center_sampling_locations(sampling_locations, kernel_w, kernel_h):
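    # Drop the kernel-center sample from every group's window: C is the
    # center index within one window and, for the odd square kernels this
    # supports, the window length kernel_w * kernel_h equals C * 2 + 1.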
idx = list(range(sampling_locations.shape[-2]))
C = (kernel_w * kernel_h - 1) // 2
idx = [i for i in idx if i != C and (i - C) % (C * 2 + 1) != 0]
sampling_locations = sampling_locations[:, :, :, idx, :]
return sampling_locations
def dcnv3_core_pytorch(input, offset, mask, kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, remove_center):
    # For debugging and testing only;
    # use the CUDA version in practice.
if remove_center and (kernel_h % 2 == 0 or kernel_w % 2 == 0
or kernel_w != kernel_h):
raise ValueError(
'remove_center is only compatible with square odd kernel size.')
input = F.pad(input, [0, 0, pad_h, pad_h, pad_w, pad_w])
N_, H_in, W_in, _ = input.shape
_, H_out, W_out, _ = offset.shape
ref = _get_reference_points(input.shape, input.device, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h,
stride_w)
grid = _generate_dilation_grids(input.shape, kernel_h, kernel_w,
dilation_h, dilation_w, group,
input.device)
spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
repeat(1, 1, 1, group*(kernel_h*kernel_w-remove_center)).\
to(input.device)
sampling_locations = (ref + grid * offset_scale).repeat(N_, 1, 1, 1, 1)
if remove_center:
sampling_locations = remove_center_sampling_locations(
sampling_locations, kernel_w=kernel_w, kernel_h=kernel_h)
sampling_locations = sampling_locations.flatten(3, 4)
sampling_locations = sampling_locations + \
offset * offset_scale / spatial_norm
P_ = kernel_h * kernel_w - remove_center
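    # F.grid_sample expects coordinates in [-1, 1]; the locations above are
    # normalized to [0, 1], so rescale them accordingly.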
sampling_grids = 2 * sampling_locations - 1
# N_, H_in, W_in, group*group_channels ->
# N_, H_in*W_in, group*group_channels ->
# N_, group*group_channels, H_in*W_in ->
# N_*group, group_channels, H_in, W_in
input_ = input.view(N_, H_in*W_in, group*group_channels).transpose(1, 2).\
reshape(N_*group, group_channels, H_in, W_in)
# N_, H_out, W_out, group*P_*2 ->
# N_, H_out*W_out, group, P_, 2 ->
# N_, group, H_out*W_out, P_, 2 ->
# N_*group, H_out*W_out, P_, 2
sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).\
transpose(1, 2).flatten(0, 1)
# N_*group, group_channels, H_out*W_out, P_
sampling_input_ = F.grid_sample(
input_,
sampling_grid_,
mode='bilinear',
padding_mode='zeros',
align_corners=False)
# (N_, H_out, W_out, group*P_) ->
# N_, H_out*W_out, group, P_ ->
# (N_, group, H_out*W_out, P_) ->
# (N_*group, 1, H_out*W_out, P_)
mask = mask.view(N_, H_out*W_out, group, P_).transpose(1, 2).\
reshape(N_*group, 1, H_out*W_out, P_)
output = (sampling_input_ * mask).sum(-1).view(N_, group * group_channels,
H_out * W_out)
return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
#!/usr/bin/env bash
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
python setup.py build install
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from .dcnv3 import DCNv3, DCNv3_pytorch # noqa
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from __future__ import absolute_import, division, print_function
import warnings
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import constant_, xavier_uniform_
from ..functions import DCNv3Function, dcnv3_core_pytorch
class to_channels_first(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 3, 1, 2)
class to_channels_last(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 2, 3, 1)
def build_norm_layer(dim,
norm_layer,
in_format='channels_last',
out_format='channels_last',
eps=1e-6):
layers = []
if norm_layer == 'BN':
if in_format == 'channels_last':
layers.append(to_channels_first())
layers.append(nn.BatchNorm2d(dim))
if out_format == 'channels_last':
layers.append(to_channels_last())
elif norm_layer == 'LN':
if in_format == 'channels_first':
layers.append(to_channels_last())
layers.append(nn.LayerNorm(dim, eps=eps))
if out_format == 'channels_first':
layers.append(to_channels_first())
else:
raise NotImplementedError(
f'build_norm_layer does not support {norm_layer}')
return nn.Sequential(*layers)
def build_act_layer(act_layer):
if act_layer == 'ReLU':
return nn.ReLU(inplace=True)
elif act_layer == 'SiLU':
return nn.SiLU(inplace=True)
elif act_layer == 'GELU':
return nn.GELU()
raise NotImplementedError(f'build_act_layer does not support {act_layer}')
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
'invalid input for _is_power_of_2: {} (type: {})'.format(
n, type(n)))
return (n & (n - 1) == 0) and n != 0
class CenterFeatureScaleModule(nn.Module):
def forward(self, query, center_feature_scale_proj_weight,
center_feature_scale_proj_bias):
center_feature_scale = F.linear(
query,
weight=center_feature_scale_proj_weight,
bias=center_feature_scale_proj_bias).sigmoid()
return center_feature_scale
class DCNv3_pytorch(nn.Module):
def __init__(
self,
channels=64,
kernel_size=3,
dw_kernel_size=None,
stride=1,
pad=1,
dilation=1,
group=4,
offset_scale=1.0,
act_layer='GELU',
norm_layer='LN',
center_feature_scale=False,
remove_center=False,
):
"""DCNv3 Module.
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super().__init__()
if channels % group != 0:
raise ValueError(f'channels must be divisible by group, '
f'but got {channels} and {group}')
_d_per_group = channels // group
dw_kernel_size = dw_kernel_size if dw_kernel_size is not None\
else kernel_size
# you'd better set _d_per_group to a power of 2
# which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 "
'to make the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
self.kernel_size = kernel_size
self.dw_kernel_size = dw_kernel_size
self.stride = stride
self.dilation = dilation
self.pad = pad
self.group = group
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.remove_center = int(remove_center)
self.dw_conv = nn.Sequential(
nn.Conv2d(
channels,
channels,
kernel_size=dw_kernel_size,
stride=1,
padding=(dw_kernel_size - 1) // 2,
groups=channels),
build_norm_layer(channels, norm_layer, 'channels_first',
'channels_last'), build_act_layer(act_layer))
self.offset = nn.Linear(
channels,
group * (kernel_size * kernel_size - remove_center) * 2)
self.mask = nn.Linear(
channels, group * (kernel_size * kernel_size - remove_center))
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
self.center_feature_scale_proj_bias = nn.Parameter(
torch.tensor(0.0, dtype=torch.float).view(
(1, )).repeat(group, ))
self.center_feature_scale_module = CenterFeatureScaleModule()
def _reset_parameters(self):
constant_(self.offset.weight.data, 0.)
constant_(self.offset.bias.data, 0.)
constant_(self.mask.weight.data, 0.)
constant_(self.mask.bias.data, 0.)
xavier_uniform_(self.input_proj.weight.data)
constant_(self.input_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, input):
"""
        :param input: input tensor of shape (N, H, W, C)
        :return: output tensor of shape (N, H, W, C)
"""
N, H, W, _ = input.shape
x = self.input_proj(input)
x_proj = x
x1 = input.permute(0, 3, 1, 2)
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
mask = F.softmax(mask, -1).reshape(N, H, W, -1)
x = dcnv3_core_pytorch(x, offset, mask, self.kernel_size,
self.kernel_size, self.stride, self.stride,
self.pad, self.pad, self.dilation,
self.dilation, self.group, self.group_channels,
self.offset_scale, self.remove_center)
if self.center_feature_scale:
center_feature_scale = self.center_feature_scale_module(
x1, self.center_feature_scale_proj_weight,
self.center_feature_scale_proj_bias)
# N, H, W, groups ->
# N, H, W, groups, 1 ->
# N, H, W, groups, _d_per_group ->
# N, H, W, channels
center_feature_scale = center_feature_scale[..., None].repeat(
1, 1, 1, 1, self.channels // self.group).flatten(-2)
x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
x = self.output_proj(x)
return x
class DCNv3(nn.Module):
def __init__(
self,
channels=64,
kernel_size=3,
dw_kernel_size=None,
stride=1,
pad=1,
dilation=1,
group=4,
offset_scale=1.0,
act_layer='GELU',
norm_layer='LN',
center_feature_scale=False,
remove_center=False,
):
"""DCNv3 Module.
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super().__init__()
if channels % group != 0:
raise ValueError(f'channels must be divisible by group, '
f'but got {channels} and {group}')
_d_per_group = channels // group
dw_kernel_size = dw_kernel_size if dw_kernel_size is not None\
else kernel_size
# you'd better set _d_per_group to a power of 2
# which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 "
'to make the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
self.kernel_size = kernel_size
self.dw_kernel_size = dw_kernel_size
self.stride = stride
self.dilation = dilation
self.pad = pad
self.group = group
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.remove_center = int(remove_center)
if self.remove_center and self.kernel_size % 2 == 0:
raise ValueError(
'remove_center is only compatible with odd kernel size.')
self.dw_conv = nn.Sequential(
nn.Conv2d(
channels,
channels,
kernel_size=dw_kernel_size,
stride=1,
padding=(dw_kernel_size - 1) // 2,
groups=channels),
build_norm_layer(channels, norm_layer, 'channels_first',
'channels_last'), build_act_layer(act_layer))
self.offset = nn.Linear(
channels,
group * (kernel_size * kernel_size - remove_center) * 2)
self.mask = nn.Linear(
channels, group * (kernel_size * kernel_size - remove_center))
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
self.center_feature_scale_proj_bias = nn.Parameter(
torch.tensor(0.0, dtype=torch.float).view(
(1, )).repeat(group, ))
self.center_feature_scale_module = CenterFeatureScaleModule()
def _reset_parameters(self):
constant_(self.offset.weight.data, 0.)
constant_(self.offset.bias.data, 0.)
constant_(self.mask.weight.data, 0.)
constant_(self.mask.bias.data, 0.)
xavier_uniform_(self.input_proj.weight.data)
constant_(self.input_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, input):
"""
        :param input: input tensor of shape (N, H, W, C)
        :return: output tensor of shape (N, H, W, C)
"""
N, H, W, _ = input.shape
x = self.input_proj(input)
x_proj = x
dtype = x.dtype
x1 = input.permute(0, 3, 1, 2)
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
mask = F.softmax(mask, -1)
mask = mask.reshape(N, H, W, -1).type(dtype)
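        # im2col_step is hard-coded to 256 here; the CUDA kernel clamps it
        # to the batch size and asserts that it divides the batch evenly.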
x = DCNv3Function.apply(x, offset, mask, self.kernel_size,
self.kernel_size, self.stride, self.stride,
self.pad, self.pad, self.dilation,
self.dilation, self.group, self.group_channels,
self.offset_scale, 256, self.remove_center)
if self.center_feature_scale:
center_feature_scale = self.center_feature_scale_module(
x1, self.center_feature_scale_proj_weight,
self.center_feature_scale_proj_bias)
# N, H, W, groups ->
# N, H, W, groups, 1 ->
# N, H, W, groups, _d_per_group ->
# N, H, W, channels
center_feature_scale = center_feature_scale[..., None].repeat(
1, 1, 1, 1, self.channels // self.group).flatten(-2)
x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
x = self.output_proj(x)
return x
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
import glob
import os
from setuptools import find_packages, setup
import torch
from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
requirements = ['torch', 'torchvision']
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, 'src')
main_file = glob.glob(os.path.join(extensions_dir, '*.cpp'))
source_cpu = glob.glob(os.path.join(extensions_dir, 'cpu', '*.cpp'))
source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {'cxx': []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [('WITH_CUDA', None)]
extra_compile_args['nvcc'] = [
# "-DCUDA_HAS_FP16=1",
# "-D__CUDA_NO_HALF_OPERATORS__",
# "-D__CUDA_NO_HALF_CONVERSIONS__",
# "-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
raise NotImplementedError('CUDA is not available')
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
'DCNv3',
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name='DCNv3',
version='1.1',
author='InternImage',
url='https://github.com/OpenGVLab/InternImage',
description='PyTorch Wrapper for CUDA Functions of DCNv3',
packages=find_packages(exclude=(
'configs',
'tests',
)),
ext_modules=get_extensions(),
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const int im2col_step) {
AT_ERROR("Not implement on cpu");
}
std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step) {
AT_ERROR("Not implement on cpu");
}
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const int im2col_step);
std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step);
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "cuda/dcnv3_im2col_cuda.cuh"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels,
const float offset_scale, const int im2col_step, const int remove_center) {
AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
const int batch = input.size(0);
const int height_in = input.size(1);
const int width_in = input.size(2);
const int channels = input.size(3);
const int height_out =
(height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1;
const int width_out =
(width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1;
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0,
"batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
AT_ASSERTM(
channels == (group * group_channels),
"Input channels and group times group channels won't match: (%d vs %d).",
channels, group * group_channels);
auto output =
at::zeros({batch, height_out, width_out, group * group_channels},
input.options());
const int batch_n = im2col_step_;
auto output_n = output.view({batch / batch_n, batch_n, height_out,
width_out, group * group_channels});
auto per_input_size = height_in * width_in * group * group_channels;
auto per_offset_size =
height_out * width_out * group * (kernel_h * kernel_w - remove_center) * 2;
auto per_mask_size = height_out * width_out * group * (kernel_h * kernel_w - remove_center);
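    // Launch the im2col kernel over the batch in chunks of im2col_step_
    // samples, writing each chunk directly into its slice of the output.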
for (int n = 0; n < batch / im2col_step_; ++n) {
auto columns = output_n.select(0, n);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.type(), "ms_deform_attn_forward_cuda", ([&] {
dcnv3_im2col_cuda(
at::cuda::getCurrentCUDAStream(),
input.data<scalar_t>() + n * im2col_step_ * per_input_size,
offset.data<scalar_t>() +
n * im2col_step_ * per_offset_size,
mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, batch_n, height_in, width_in, height_out,
width_out, offset_scale, remove_center);
}));
}
return output;
}
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step, const int remove_center) {
AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(),
"grad_output tensor has to be contiguous");
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(),
"grad_output must be a CUDA tensor");
const int batch = input.size(0);
const int height_in = input.size(1);
const int width_in = input.size(2);
const int channels = input.size(3);
const int height_out =
(height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1;
const int width_out =
(width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1;
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0,
"batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
AT_ASSERTM(
channels == (group * group_channels),
"Input channels and group times group channels won't match: (%d vs %d).",
channels, group * group_channels);
auto dtype = input.dtype();
if (dtype == at::kHalf) {
dtype = at::kFloat;
}
auto grad_input = at::zeros_like(input, dtype);
auto grad_offset = at::zeros_like(offset, dtype);
auto grad_mask = at::zeros_like(mask, dtype);
const int batch_n = im2col_step_;
auto per_input_size = height_in * width_in * group * group_channels;
auto per_offset_size =
height_out * width_out * group * (kernel_h * kernel_w - remove_center) * 2;
auto per_mask_size = height_out * width_out * group * (kernel_h * kernel_w - remove_center);
auto grad_output_n =
grad_output.view({batch / im2col_step_, batch_n, height_out * width_out,
group, group_channels});
for (int n = 0; n < batch / im2col_step_; ++n) {
auto grad_output_g = grad_output_n.select(0, n);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.type(), "ms_deform_attn_backward_cuda", ([&] {
dcnv3_col2im_cuda(
at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
input.data<scalar_t>() + n * im2col_step_ * per_input_size,
offset.data<scalar_t>() +
n * im2col_step_ * per_offset_size,
mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, group_channels, batch_n,
height_in, width_in, height_out, width_out, offset_scale, remove_center,
grad_input.data<opmath_t>() +
n * im2col_step_ * per_input_size,
grad_offset.data<opmath_t>() +
n * im2col_step_ * per_offset_size,
grad_mask.data<opmath_t>() +
n * im2col_step_ * per_mask_size);
}));
}
if (input.dtype() == torch::kHalf) {
return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
grad_mask.to(torch::kHalf)};
} else {
return {grad_input, grad_offset, grad_mask};
}
}
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels,
const float offset_scale, const int im2col_step, const int remove_center);
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step, const int remove_center);
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/dcnv3_cpu.h"
#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif
at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w,
const int group, const int group_channels,
const float offset_scale, const int im2col_step, const int remove_center) {
if (input.type().is_cuda()) {
#ifdef WITH_CUDA
return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, group_channels,
offset_scale, im2col_step, remove_center);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels,
const float offset_scale, const at::Tensor &grad_output,
const int im2col_step, const int remove_center) {
if (input.type().is_cuda()) {
#ifdef WITH_CUDA
return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, group_channels,
offset_scale, grad_output, im2col_step, remove_center);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "dcnv3.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward");
m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward");
}
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from __future__ import absolute_import, division, print_function
import math # noqa
import time
import torch
import torch.nn as nn # noqa
from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch
from torch.autograd import gradcheck # noqa
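# Test-case constants: N batch size, M deformable groups, D channels per
# group, (Kh, Kw) kernel size, P sampling points per group.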
H_in, W_in = 8, 8
N, M, D = 2, 4, 16
Kh, Kw = 3, 3
remove_center = False
P = Kh * Kw - remove_center
offset_scale = 2.0
pad = 1
dilation = 1
stride = 1
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
torch.manual_seed(3)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
input = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask /= mask.sum(-1, keepdim=True)
mask = mask.reshape(N, H_out, W_out, M * P)
output_pytorch = dcnv3_core_pytorch(input.double(), offset.double(),
mask.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation,
M, D, offset_scale,
remove_center).detach().cpu()
im2col_step = 2
output_cuda = DCNv3Function.apply(input.double(), offset.double(),
mask.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation, M,
D, offset_scale, im2col_step,
remove_center).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() /
output_pytorch.abs()).max()
print('>>> forward double')
print(f'* {fwdok} check_forward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
input = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask /= mask.sum(-1, keepdim=True)
mask = mask.reshape(N, H_out, W_out, M * P)
output_pytorch = dcnv3_core_pytorch(input, offset, mask, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
remove_center).detach().cpu()
im2col_step = 2
output_cuda = DCNv3Function.apply(input, offset, mask, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
im2col_step,
remove_center).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() /
output_pytorch.abs()).max()
print('>>> forward float')
print(f'* {fwdok} check_forward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_backward_equal_with_pytorch_double(channels=4,
grad_input=True,
grad_offset=True,
grad_mask=True):
# H_in, W_in = 4, 4
N = 2
M = 2
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
D = channels
input0 = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset0 = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask0 /= mask0.sum(-1, keepdim=True)
mask0 = mask0.reshape(N, H_out, W_out, M * P)
input0.requires_grad = grad_input
offset0.requires_grad = grad_offset
mask0.requires_grad = grad_mask
output_pytorch = dcnv3_core_pytorch(input0.double(), offset0.double(),
mask0.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation,
M, D, offset_scale, remove_center)
output_pytorch.sum().backward()
input1 = input0.detach()
offset1 = offset0.detach()
mask1 = mask0.detach()
input1.requires_grad = grad_input
offset1.requires_grad = grad_offset
mask1.requires_grad = grad_mask
im2col_step = 2
output_cuda = DCNv3Function.apply(input1.double(), offset1.double(),
mask1.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation, M,
D, offset_scale, im2col_step,
remove_center)
output_cuda.sum().backward()
print(f'>>> backward double: channels {D}')
bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (input0.grad - input1.grad).abs().max()
max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max()
print(f'* {bwdok} input_grad check_backward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (offset0.grad - offset1.grad).abs().max()
max_rel_err = ((offset0.grad - offset1.grad).abs() /
offset0.grad.abs()).max()
print(f'* {bwdok} offset_grad check_backward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (mask0.grad - mask1.grad).abs().max()
max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max()
print(f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_backward_equal_with_pytorch_float(channels=4,
grad_input=True,
grad_offset=True,
grad_mask=True):
# H_in, W_in = 4, 4
N = 2
M = 2
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
D = channels
input0 = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset0 = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask0 /= mask0.sum(-1, keepdim=True)
mask0 = mask0.reshape(N, H_out, W_out, M * P)
input0.requires_grad = grad_input
offset0.requires_grad = grad_offset
mask0.requires_grad = grad_mask
output_pytorch = dcnv3_core_pytorch(input0, offset0, mask0, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
remove_center)
output_pytorch.sum().backward()
input1 = input0.detach()
offset1 = offset0.detach()
mask1 = mask0.detach()
input1.requires_grad = grad_input
offset1.requires_grad = grad_offset
mask1.requires_grad = grad_mask
im2col_step = 2
output_cuda = DCNv3Function.apply(input1, offset1, mask1, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
im2col_step, remove_center)
output_cuda.sum().backward()
print(f'>>> backward float: channels {D}')
bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (input0.grad - input1.grad).abs().max()
max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max()
print(f'* {bwdok} input_grad check_backward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (offset0.grad - offset1.grad).abs().max()
max_rel_err = ((offset0.grad - offset1.grad).abs() /
offset0.grad.abs()).max()
print(f'* {bwdok} offset_grad check_backward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (mask0.grad - mask1.grad).abs().max()
max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max()
print(f'* {bwdok} mask_grad check_backward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_time_cost(im2col_step=128):
N = 512
H_in, W_in = 64, 64
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
input = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask /= mask.sum(-1, keepdim=True)
mask = mask.reshape(N, H_out, W_out, M * P)
print(f'>>> time cost: im2col_step {im2col_step};'
f' input {input.shape}; points {P}')
repeat = 100
for i in range(repeat):
output_cuda = DCNv3Function.apply(input, offset, mask, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, 1.0, im2col_step,
remove_center)
torch.cuda.synchronize()
start = time.time()
for i in range(repeat):
output_cuda = DCNv3Function.apply( # noqa
input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2,
dilation, dilation, M, D, 1.0, im2col_step, remove_center)
torch.cuda.synchronize()
print(f'forward time cost: {(time.time() - start) / repeat}')
if __name__ == '__main__':
check_forward_equal_with_pytorch_double()
check_forward_equal_with_pytorch_float()
for channels in [1, 16, 30, 32, 64, 71, 1025]:
check_backward_equal_with_pytorch_double(channels, True, True, True)
for channels in [1, 16, 30, 32, 64, 71, 1025]:
check_backward_equal_with_pytorch_float(channels, True, True, True)
for i in range(3):
im2col_step = 128 * (2**i)
check_time_cost(im2col_step)
# MaskFeat Pre-training with Video
- [MaskFeat Pre-training with Video](#maskfeat-pre-training-with-video)
- [Description](#description)
- [Usage](#usage)
- [Setup Environment](#setup-environment)
- [Data Preparation](#data-preparation)
- [Pre-training Commands](#pre-training-commands)
- [On Local Single GPU](#on-local-single-gpu)
- [On Multiple GPUs](#on-multiple-gpus)
- [On Multiple GPUs with Slurm](#on-multiple-gpus-with-slurm)
- [Downstream Tasks Commands](#downstream-tasks-commands)
- [On Multiple GPUs](#on-multiple-gpus-1)
- [On Multiple GPUs with Slurm](#on-multiple-gpus-with-slurm-1)
- [Results](#results)
- [Citation](#citation)
- [Checklist](#checklist)
## Description
<!-- Share any information you would like others to know. For example:
Author: @xxx.
This is an implementation of \[XXX\]. -->
Author: @fangyixiao18
This is an implementation of **MaskFeat** on video datasets such as Kinetics-400.
## Usage
<!-- For a typical model, this section should contain the commands for dataset preparation, pre-training, and downstream tasks. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
### Setup Environment
Requirements:
- MMPretrain >= 1.0.0rc0
- MMAction2 >= 1.0.0rc3
Please refer to the [Get Started](https://mmpretrain.readthedocs.io/en/latest/get_started.html) documentation of MMPretrain to complete the installation.
In addition, we use transforms from MMAction2 to process the video data. Instructions for installing MMAction2 can be found in its [Get Started documentation](https://mmaction2.readthedocs.io/en/1.x/get_started.html).
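For reference, a minimal installation sketch using [MIM](https://github.com/open-mmlab/mim); the version pins are illustrative, so follow the linked documentation for authoritative steps:
```bash
pip install -U openmim
mim install "mmpretrain>=1.0.0rc0"
mim install "mmaction2>=1.0.0rc3"
```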
### Data Preparation
You can refer to the [documentation](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) in MMAction2.
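For reference, each line of a `VideoDataset` annotation list pairs a video path (relative to the data root) with a label id; the entries below are hypothetical:
```text
abseiling/abc123.mp4 0
air_drumming/def456.mp4 1
```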
### Pre-training Commands
First, add the current folder to `PYTHONPATH` so that Python can find your model files. From the `projects/maskfeat_video/` root directory, run the command below to add it.
```shell
export PYTHONPATH=`pwd`:$PYTHONPATH
```
Then run the following commands to train the model:
#### On Local Single GPU
```bash
# train with mim
mim train mmpretrain ${CONFIG} --work-dir ${WORK_DIR}
# a specific command example
mim train mmpretrain configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400/
# train with scripts
python tools/train.py configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400/
```
#### On Multiple GPUs
```bash
# train with mim
# a specific command example with 8 GPUs
mim train mmpretrain configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400/ \
--launcher pytorch --gpus 8
# train with scripts
bash tools/dist_train.sh configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py 8
```
Note:
- CONFIG: the config files under the directory `configs/`
- WORK_DIR: the working directory to save configs, logs, and checkpoints
#### On Multiple GPUs with Slurm
```bash
# train with mim
mim train mmpretrain configs/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/ \
--launcher slurm --gpus 16 --gpus-per-node 8 \
--partition ${PARTITION}
# train with scripts
GPUS_PER_NODE=8 GPUS=16 bash tools/slurm_train.sh ${PARTITION} maskfeat-video \
configs/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/
```
Note:
- CONFIG: the config files under the directory `configs/`
- WORK_DIR: the working directory to save configs, logs, and checkpoints
- PARTITION: the slurm partition you are using
### Downstream Tasks Commands
To evaluate the **MaskFeat MViT** pretrained with MMPretrain, we recommend running the downstream tasks with MMAction2:
#### On Multiple GPUs
```bash
# command example for train
mim train mmaction2 ${CONFIG} \
--work-dir ${WORK_DIR} \
--launcher pytorch --gpus 8 \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=${CHECKPOINT} \
model.backbone.init_cfg.prefix="backbone." \
${PY_ARGS}
# a specific command example
mim train mmaction2 configs/mvit-small_ft-8xb8-coslr-100e_k400.py \
--work-dir work_dirs/benchmarks/maskfeat/training_maskfeat-mvit-k400/ \
--launcher pytorch --gpus 8 \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400_20230131-87d60b6f.pth \
model.backbone.init_cfg.prefix="backbone." \
$PY_ARGS
# command example for test
mim test mmaction2 configs/mvit-small_ft-8xb16-coslr-100e_k400.py \
--checkpoint https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230131-5e8303f5.pth \
--work-dir work_dirs/benchmarks/maskfeat/maskfeat-mvit-k400/test/ \
--launcher pytorch --gpus 8
```
#### On Multiple GPUs with Slurm
```bash
mim train mmaction2 ${CONFIG} \
--work-dir ${WORK_DIR} \
--launcher slurm --gpus 8 --gpus-per-node 8 \
--partition ${PARTITION} \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=$CHECKPOINT \
model.backbone.init_cfg.prefix="backbone." \
$PY_ARGS
mim test mmaction2 ${CONFIG} \
--checkpoint https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230131-5e8303f5.pth \
--work-dir ${WORK_DIR} \
--launcher slurm --gpus 8 --gpus-per-node 8 \
--partition ${PARTITION} \
$PY_ARGS
```
Note:
- CONFIG: the config files under the directory `configs/`
- WORK_DIR: the working directory to save configs, logs, and checkpoints
- PARTITION: the slurm partition you are using
- CHECKPOINT: the pretrained checkpoint of MMPretrain saved in working directory, like `$WORK_DIR/epoch_300.pth`
- PY_ARGS: other optional args
## Results
<!-- You should claim whether this is based on the pre-trained weights, which are converted from the official release; or it's a reproduced result obtained from retraining the model in this project. -->
The fine-tuning results are based on the Kinetics-400 (K400) dataset.
Since several versions of K400 exist, our pre-training, fine-tuning, and final test results are based on the MMAction2 version, which differs slightly from the PySlowFast version.
<table class="docutils">
<thead>
<tr>
<th>Algorithm</th>
<th>Backbone</th>
<th>Epoch</th>
<th>Batch Size</th>
<th>Fine-tuning Top-1 (%)</th>
<th>Pretrain Links</th>
<th>Fine-tuning Links</th>
</tr>
</thead>
<tbody>
<tr>
<td>MaskFeat</td>
<td>MViT-small</td>
<td>300</td>
<td>512</td>
<td>81.8</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/projects/maskfeat_video/configs/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400_20230131-87d60b6f.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400_20230118_114151.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/projects/maskfeat_video/configs/mvit-small_ft-8xb16-coslr-100e_k400.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230131-5e8303f5.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230121_142927.json'>log</a></td>
</tr>
</tbody>
</table>
Remarks:
- We converted the pretrained model from PySlowFast and ran fine-tuning with MMAction2; on the MMAction2 version of K400 it reached `81.5` test accuracy, while the model pretrained with MMPretrain reached `81.8`, as provided above.
- We also tested our model on [another version](https://github.com/facebookresearch/video-nonlocal-net/blob/main/DATASET.md) of K400 and obtained `82.1` test accuracy.
- Some other details can be found in [MMAction2 MViT page](https://github.com/open-mmlab/mmaction2/tree/dev-1.x/configs/recognition/mvit).
## Citation
```bibtex
@InProceedings{wei2022masked,
author = {Wei, Chen and Fan, Haoqi and Xie, Saining and Wu, Chao-Yuan and Yuille, Alan and Feichtenhofer, Christoph},
title = {Masked Feature Prediction for Self-Supervised Visual Pre-Training},
booktitle = {CVPR},
year = {2022},
}
```
## Checklist
Here is a checklist illustrating a usual development workflow of a successful project, which also serves as an overview of this project's progress.
<!--The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR.
OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone.
Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed.
A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. -->
- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
- [x] Finish the code
<!-- The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `MMPretrain.registry.MODELS` and configurable via a config file. -->
- [x] Basic docstrings & proper citation
<!-- Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) -->
- [x] Inference correctness
<!-- If you are reproducing the result from a paper, make sure your model's inference-time feature vectors or losses matches that from the original codes. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. -->
- [x] A full README
<!-- As this template does. -->
- [x] Milestone 2: Indicates a successful model implementation.
- [x] Training-time correctness
<!-- If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result. Due to the pretrain-downstream pipeline of self-supervised learning, this item requires at least one downstream result matches the report within a minor error range. -->
- [ ] Milestone 3: Good to be a part of our core package!
- [ ] Type hints and docstrings
<!-- Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/selfsup/mae.py) -->
- [ ] Unit tests
<!-- Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmpretrain/blob/main/tests/test_models/test_selfsup/test_mae.py) -->
- [ ] Code polishing
<!-- Refactor your code according to reviewer's comment. -->
- [ ] `metafile.yml` and `README.md`
<!-- It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmpretrain/blob/main/configs/mae/metafile.yml). In particular, you may have to refactor this README into a standard one. [Example](https://github.com/open-mmlab/mmpretrain/blob/main/configs/mae/README.md) -->
- [ ] Refactor and Move your modules into the core package following the codebase's file hierarchy structure.
_base_ = 'mmpretrain::_base_/default_runtime.py'
custom_imports = dict(imports=['models'], allow_failed_imports=False)
model = dict(
type='VideoMaskFeat',
backbone=dict(
type='MaskFeatMViT',
arch='maskfeat-small',
drop_path_rate=0.0,
dim_mul_in_attention=False),
neck=dict(
type='LinearNeck',
in_channels=768,
out_channels=108,
with_avg_pool=False,
init_cfg=dict(type='TruncNormal', layer='Linear', std=0.02, bias=0)),
head=dict(
type='MaskFeatPretrainHead',
loss=dict(type='PixelReconstructionLoss', criterion='L2')),
target_generator=dict(
type='HOGGenerator3d', nbins=9, pool=8, gaussian_window=16))
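# Note: the neck's out_channels=108 matches the HOG target dimension of
# HOGGenerator3d: 9 bins x 3 color channels x (2 x 2) unfolded cells.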
# dataset settings
dataset_type = 'mmaction.VideoDataset'
data_root = 'data/kinetics400/videos_train'
ann_file_train = 'data/Kinetics400/kinetics400_train_list_videos.txt'
data_preprocessor = dict(
type='VideoDataPreprocessor',
mean=[114.75, 114.75, 114.75],
std=[57.375, 57.375, 57.375],
format_shape='NCTHW')
train_pipeline = [
dict(type='mmaction.DecordInit'),
dict(
type='mmaction.SampleFrames',
clip_len=16,
frame_interval=4,
num_clips=1),
dict(type='mmaction.DecordDecode'),
dict(type='mmaction.Resize', scale=(-1, 256)),
dict(type='mmaction.RandomResizedCrop', area_range=(0.5, 1.0)),
dict(type='mmaction.Resize', scale=(224, 224), keep_ratio=False),
dict(type='mmaction.Flip', flip_ratio=0.5),
dict(type='mmaction.FormatShape', input_format='NCTHW'),
dict(
type='MaskFeatMaskGenerator3D',
input_size=(8, 7, 7),
num_masking_patches=157,
min_num_patches=9,
max_num_patches=49),
dict(type='PackInputs', input_key='imgs')
]
train_dataloader = dict(
batch_size=32,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=dict(video=data_root),
pipeline=train_pipeline))
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW', lr=8e-4 * 2, betas=(0.9, 0.999), weight_decay=0.05),
clip_grad=dict(max_norm=0.02),
paramwise_cfg=dict(
bias_decay_mult=0.,
norm_decay_mult=0.,
custom_keys={
'pos_embed': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=290,
eta_min=1e-6,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
checkpoint=dict(interval=1, max_keep_ckpts=2), logger=dict(interval=100))
_base_ = './maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py'
optim_wrapper = dict(
optimizer=dict(
type='AdamW', lr=8e-4, betas=(0.9, 0.999), weight_decay=0.05))
_base_ = [
'mmaction::_base_/models/mvit_small.py',
'mmaction::_base_/default_runtime.py'
]
model = dict(
backbone=dict(
drop_path_rate=0.1,
dim_mul_in_attention=False,
pretrained=None,
pretrained_type='maskfeat',
),
data_preprocessor=dict(
type='ActionDataPreprocessor',
mean=[114.75, 114.75, 114.75],
std=[57.375, 57.375, 57.375],
blending=dict(
type='RandomBatchAugment',
augments=[
dict(type='MixupBlending', alpha=0.8, num_classes=400),
dict(type='CutmixBlending', alpha=1, num_classes=400)
]),
format_shape='NCTHW'),
cls_head=dict(dropout_ratio=0., init_scale=0.001))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='PytorchVideoWrapper', op='RandAugment', magnitude=7),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='RandomErasing', erase_prob=0.25, mode='rand'),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=16,
frame_interval=4,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=16,
frame_interval=4,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 224)),
dict(type='CenterCrop', crop_size=224),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
repeat_sample = 2
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='repeat_pseudo_collate'),
dataset=dict(
type='RepeatAugDataset',
num_repeats=repeat_sample,
ann_file=ann_file_train,
data_prefix=dict(video=data_root),
pipeline=train_pipeline))
val_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=dict(video=data_root_val),
pipeline=val_pipeline,
test_mode=True))
test_dataloader = dict(
batch_size=1,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=dict(video=data_root_val),
pipeline=test_pipeline,
test_mode=True))
val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator
train_cfg = dict(
type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
base_lr = 9.6e-3
optim_wrapper = dict(
optimizer=dict(
type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
constructor='LearningRateDecayOptimizerConstructor',
paramwise_cfg={
'decay_rate': 0.75,
'decay_type': 'layer_wise',
'num_layers': 16
},
clip_grad=dict(max_norm=5, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1 / 600,
by_epoch=True,
begin=0,
end=20,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=80,
eta_min_ratio=1 / 600,
by_epoch=True,
begin=20,
end=100,
convert_to_iter_based=True)
]
default_hooks = dict(
checkpoint=dict(interval=3, max_keep_ckpts=20), logger=dict(interval=100))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (64 samples per GPU) / repeat_sample.
auto_scale_lr = dict(enable=True, base_batch_size=512 // repeat_sample)
from .hog_generator_3d import HOGGenerator3d
from .maskfeat import VideoMaskFeat
from .maskfeat_mvit import MaskFeatMViT
from .transforms import MaskFeatMaskGenerator3D
__all__ = [
'HOGGenerator3d', 'VideoMaskFeat', 'MaskFeatMViT',
'MaskFeatMaskGenerator3D'
]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmpretrain.models import HOGGenerator
from mmpretrain.registry import MODELS
@MODELS.register_module()
class HOGGenerator3d(HOGGenerator):
"""Generate HOG feature for videos.
This module is used in MaskFeat to generate HOG feature.
Here is the link of `HOG wikipedia
<https://en.wikipedia.org/wiki/Histogram_of_oriented_gradients>`_.
Args:
nbins (int): Number of bin. Defaults to 9.
pool (float): Number of cell. Defaults to 8.
gaussian_window (int): Size of gaussian kernel. Defaults to 16.
"""
def __init__(self,
nbins: int = 9,
pool: int = 8,
gaussian_window: int = 16) -> None:
super().__init__(
nbins=nbins, pool=pool, gaussian_window=gaussian_window)
def _reshape(self, hog_feat: torch.Tensor) -> torch.Tensor:
"""Reshape HOG Features for output."""
hog_feat = hog_feat.flatten(1, 2)
self.unfold_size = hog_feat.shape[-1] // 14
hog_feat = hog_feat.permute(0, 2, 3, 1)
hog_feat = hog_feat.unfold(1, self.unfold_size,
self.unfold_size).unfold(
2, self.unfold_size, self.unfold_size)
hog_feat = hog_feat.flatten(3).view(self.B, self.T, 14, 14, -1)
hog_feat = hog_feat.flatten(1, 3) # B N C
return hog_feat
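# A minimal shape-check sketch (illustrative only): it assumes the caller
# sets ``B`` and ``T`` before ``_reshape`` runs, as the pre-training model
# does, and a 224x224 input with the default pool=8 and nbins=9.
if __name__ == '__main__':
    gen = HOGGenerator3d()
    gen.B, gen.T = 2, 8  # 2 clips, 8 temporal patches each
    hog = torch.rand(gen.B * gen.T, 3, 9, 28, 28)  # per-frame HOG maps
    out = gen._reshape(hog)
    assert out.shape == (2, 8 * 14 * 14, 108)  # (B, N, C), C = 9 * 3 * 2 * 2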