# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch # noqa
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from __future__ import absolute_import, division, print_function
import pkg_resources
import DCNv3
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd
dcn_version = float(pkg_resources.get_distribution('DCNv3').version)
class DCNv3Function(Function):
@staticmethod
@custom_fwd
def forward(ctx, input, offset, mask, kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, im2col_step, remove_center):
ctx.kernel_h = kernel_h
ctx.kernel_w = kernel_w
ctx.stride_h = stride_h
ctx.stride_w = stride_w
ctx.pad_h = pad_h
ctx.pad_w = pad_w
ctx.dilation_h = dilation_h
ctx.dilation_w = dilation_w
ctx.group = group
ctx.group_channels = group_channels
ctx.offset_scale = offset_scale
ctx.im2col_step = im2col_step
ctx.remove_center = remove_center
args = [
input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h,
pad_w, dilation_h, dilation_w, group, group_channels, offset_scale,
ctx.im2col_step
]
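        # Append remove_center only when the compiled DCNv3 op supports it
        # (builds newer than 1.0) or when it is explicitly requested.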
if remove_center or dcn_version > 1.0:
args.append(remove_center)
output = DCNv3.dcnv3_forward(*args)
ctx.save_for_backward(input, offset, mask)
return output
@staticmethod
@once_differentiable
@custom_bwd
def backward(ctx, grad_output):
input, offset, mask = ctx.saved_tensors
args = [
input, offset, mask, ctx.kernel_h, ctx.kernel_w, ctx.stride_h,
ctx.stride_w, ctx.pad_h, ctx.pad_w, ctx.dilation_h, ctx.dilation_w,
ctx.group, ctx.group_channels, ctx.offset_scale,
grad_output.contiguous(), ctx.im2col_step
]
if ctx.remove_center or dcn_version > 1.0:
args.append(ctx.remove_center)
grad_input, grad_offset, grad_mask = \
DCNv3.dcnv3_backward(*args)
return grad_input, grad_offset, grad_mask, \
None, None, None, None, None, None, None,\
None, None, None, None, None, None
@staticmethod
def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, im2col_step, remove_center):
"""Symbolic function for mmdeploy::DCNv3.
Returns:
DCNv3 op for onnx.
"""
return g.op(
'mmdeploy::TRTDCNv3',
input,
offset,
mask,
kernel_h_i=int(kernel_h),
kernel_w_i=int(kernel_w),
stride_h_i=int(stride_h),
stride_w_i=int(stride_w),
pad_h_i=int(pad_h),
pad_w_i=int(pad_w),
dilation_h_i=int(dilation_h),
dilation_w_i=int(dilation_w),
group_i=int(group),
group_channels_i=int(group_channels),
offset_scale_f=float(offset_scale),
im2col_step_i=int(im2col_step),
remove_center_i=int(remove_center),
)
def _get_reference_points(spatial_shapes,
device,
kernel_h,
kernel_w,
dilation_h,
dilation_w,
pad_h=0,
pad_w=0,
stride_h=1,
stride_w=1):
_, H_, W_, _ = spatial_shapes
H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
ref_y, ref_x = torch.meshgrid(
torch.linspace(
# pad_h + 0.5,
# H_ - pad_h - 0.5,
(dilation_h * (kernel_h - 1)) // 2 + 0.5,
(dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h,
H_out,
dtype=torch.float32,
device=device),
torch.linspace(
# pad_w + 0.5,
# W_ - pad_w - 0.5,
(dilation_w * (kernel_w - 1)) // 2 + 0.5,
(dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w,
W_out,
dtype=torch.float32,
device=device))
ref_y = ref_y.reshape(-1)[None] / H_
ref_x = ref_x.reshape(-1)[None] / W_
ref = torch.stack((ref_x, ref_y), -1).reshape(1, H_out, W_out, 1, 2)
return ref
def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h,
dilation_w, group, device):
_, H_, W_, _ = spatial_shapes
points_list = []
x, y = torch.meshgrid(
torch.linspace(
-((dilation_w * (kernel_w - 1)) // 2),
-((dilation_w * (kernel_w - 1)) // 2) +
(kernel_w - 1) * dilation_w,
kernel_w,
dtype=torch.float32,
device=device),
torch.linspace(
-((dilation_h * (kernel_h - 1)) // 2),
-((dilation_h * (kernel_h - 1)) // 2) +
(kernel_h - 1) * dilation_h,
kernel_h,
dtype=torch.float32,
device=device))
points_list.extend([x / W_, y / H_])
grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\
repeat(1, group, 1).permute(1, 0, 2)
grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2)
return grid
def remove_center_sampling_locations(sampling_locations, kernel_w, kernel_h):
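    # Drop the kernel-center sample from every group's window: C is the
    # center index within one window and, for the odd square kernels this
    # supports, the window length kernel_w * kernel_h equals C * 2 + 1.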
idx = list(range(sampling_locations.shape[-2]))
C = (kernel_w * kernel_h - 1) // 2
idx = [i for i in idx if i != C and (i - C) % (C * 2 + 1) != 0]
sampling_locations = sampling_locations[:, :, :, idx, :]
return sampling_locations
def dcnv3_core_pytorch(input, offset, mask, kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, remove_center):
    # For debugging and testing only;
    # use the CUDA version in practice.
if remove_center and (kernel_h % 2 == 0 or kernel_w % 2 == 0
or kernel_w != kernel_h):
raise ValueError(
'remove_center is only compatible with square odd kernel size.')
input = F.pad(input, [0, 0, pad_h, pad_h, pad_w, pad_w])
N_, H_in, W_in, _ = input.shape
_, H_out, W_out, _ = offset.shape
ref = _get_reference_points(input.shape, input.device, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h,
stride_w)
grid = _generate_dilation_grids(input.shape, kernel_h, kernel_w,
dilation_h, dilation_w, group,
input.device)
spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
repeat(1, 1, 1, group*(kernel_h*kernel_w-remove_center)).\
to(input.device)
sampling_locations = (ref + grid * offset_scale).repeat(N_, 1, 1, 1, 1)
if remove_center:
sampling_locations = remove_center_sampling_locations(
sampling_locations, kernel_w=kernel_w, kernel_h=kernel_h)
sampling_locations = sampling_locations.flatten(3, 4)
sampling_locations = sampling_locations + \
offset * offset_scale / spatial_norm
P_ = kernel_h * kernel_w - remove_center
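    # F.grid_sample expects coordinates in [-1, 1]; the locations above are
    # normalized to [0, 1], so rescale them accordingly.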
sampling_grids = 2 * sampling_locations - 1
# N_, H_in, W_in, group*group_channels ->
# N_, H_in*W_in, group*group_channels ->
# N_, group*group_channels, H_in*W_in ->
# N_*group, group_channels, H_in, W_in
input_ = input.view(N_, H_in*W_in, group*group_channels).transpose(1, 2).\
reshape(N_*group, group_channels, H_in, W_in)
# N_, H_out, W_out, group*P_*2 ->
# N_, H_out*W_out, group, P_, 2 ->
# N_, group, H_out*W_out, P_, 2 ->
# N_*group, H_out*W_out, P_, 2
sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).\
transpose(1, 2).flatten(0, 1)
# N_*group, group_channels, H_out*W_out, P_
sampling_input_ = F.grid_sample(
input_,
sampling_grid_,
mode='bilinear',
padding_mode='zeros',
align_corners=False)
# (N_, H_out, W_out, group*P_) ->
# N_, H_out*W_out, group, P_ ->
# (N_, group, H_out*W_out, P_) ->
# (N_*group, 1, H_out*W_out, P_)
mask = mask.view(N_, H_out*W_out, group, P_).transpose(1, 2).\
reshape(N_*group, 1, H_out*W_out, P_)
output = (sampling_input_ * mask).sum(-1).view(N_, group * group_channels,
H_out * W_out)
return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
#!/usr/bin/env bash
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
python setup.py build install
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from .dcnv3 import DCNv3, DCNv3_pytorch # noqa
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from __future__ import absolute_import, division, print_function
import warnings
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import constant_, xavier_uniform_
from ..functions import DCNv3Function, dcnv3_core_pytorch
class to_channels_first(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 3, 1, 2)
class to_channels_last(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 2, 3, 1)
def build_norm_layer(dim,
norm_layer,
in_format='channels_last',
out_format='channels_last',
eps=1e-6):
layers = []
if norm_layer == 'BN':
if in_format == 'channels_last':
layers.append(to_channels_first())
layers.append(nn.BatchNorm2d(dim))
if out_format == 'channels_last':
layers.append(to_channels_last())
elif norm_layer == 'LN':
if in_format == 'channels_first':
layers.append(to_channels_last())
layers.append(nn.LayerNorm(dim, eps=eps))
if out_format == 'channels_first':
layers.append(to_channels_first())
else:
raise NotImplementedError(
f'build_norm_layer does not support {norm_layer}')
return nn.Sequential(*layers)
def build_act_layer(act_layer):
if act_layer == 'ReLU':
return nn.ReLU(inplace=True)
elif act_layer == 'SiLU':
return nn.SiLU(inplace=True)
elif act_layer == 'GELU':
return nn.GELU()
raise NotImplementedError(f'build_act_layer does not support {act_layer}')
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
'invalid input for _is_power_of_2: {} (type: {})'.format(
n, type(n)))
return (n & (n - 1) == 0) and n != 0
class CenterFeatureScaleModule(nn.Module):
def forward(self, query, center_feature_scale_proj_weight,
center_feature_scale_proj_bias):
center_feature_scale = F.linear(
query,
weight=center_feature_scale_proj_weight,
bias=center_feature_scale_proj_bias).sigmoid()
return center_feature_scale
class DCNv3_pytorch(nn.Module):
def __init__(
self,
channels=64,
kernel_size=3,
dw_kernel_size=None,
stride=1,
pad=1,
dilation=1,
group=4,
offset_scale=1.0,
act_layer='GELU',
norm_layer='LN',
center_feature_scale=False,
remove_center=False,
):
"""DCNv3 Module.
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super().__init__()
if channels % group != 0:
raise ValueError(f'channels must be divisible by group, '
f'but got {channels} and {group}')
_d_per_group = channels // group
dw_kernel_size = dw_kernel_size if dw_kernel_size is not None\
else kernel_size
# you'd better set _d_per_group to a power of 2
# which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 "
'to make the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
self.kernel_size = kernel_size
self.dw_kernel_size = dw_kernel_size
self.stride = stride
self.dilation = dilation
self.pad = pad
self.group = group
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.remove_center = int(remove_center)
self.dw_conv = nn.Sequential(
nn.Conv2d(
channels,
channels,
kernel_size=dw_kernel_size,
stride=1,
padding=(dw_kernel_size - 1) // 2,
groups=channels),
build_norm_layer(channels, norm_layer, 'channels_first',
'channels_last'), build_act_layer(act_layer))
self.offset = nn.Linear(
channels,
group * (kernel_size * kernel_size - remove_center) * 2)
self.mask = nn.Linear(
channels, group * (kernel_size * kernel_size - remove_center))
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
self.center_feature_scale_proj_bias = nn.Parameter(
torch.tensor(0.0, dtype=torch.float).view(
(1, )).repeat(group, ))
self.center_feature_scale_module = CenterFeatureScaleModule()
def _reset_parameters(self):
constant_(self.offset.weight.data, 0.)
constant_(self.offset.bias.data, 0.)
constant_(self.mask.weight.data, 0.)
constant_(self.mask.bias.data, 0.)
xavier_uniform_(self.input_proj.weight.data)
constant_(self.input_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, input):
"""
        :param input: input tensor of shape (N, H, W, C)
        :return: output tensor of shape (N, H, W, C)
"""
N, H, W, _ = input.shape
x = self.input_proj(input)
x_proj = x
x1 = input.permute(0, 3, 1, 2)
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
mask = F.softmax(mask, -1).reshape(N, H, W, -1)
x = dcnv3_core_pytorch(x, offset, mask, self.kernel_size,
self.kernel_size, self.stride, self.stride,
self.pad, self.pad, self.dilation,
self.dilation, self.group, self.group_channels,
self.offset_scale, self.remove_center)
if self.center_feature_scale:
center_feature_scale = self.center_feature_scale_module(
x1, self.center_feature_scale_proj_weight,
self.center_feature_scale_proj_bias)
# N, H, W, groups ->
# N, H, W, groups, 1 ->
# N, H, W, groups, _d_per_group ->
# N, H, W, channels
center_feature_scale = center_feature_scale[..., None].repeat(
1, 1, 1, 1, self.channels // self.group).flatten(-2)
x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
x = self.output_proj(x)
return x
class DCNv3(nn.Module):
def __init__(
self,
channels=64,
kernel_size=3,
dw_kernel_size=None,
stride=1,
pad=1,
dilation=1,
group=4,
offset_scale=1.0,
act_layer='GELU',
norm_layer='LN',
center_feature_scale=False,
remove_center=False,
):
"""DCNv3 Module.
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super().__init__()
if channels % group != 0:
raise ValueError(f'channels must be divisible by group, '
f'but got {channels} and {group}')
_d_per_group = channels // group
dw_kernel_size = dw_kernel_size if dw_kernel_size is not None\
else kernel_size
# you'd better set _d_per_group to a power of 2
# which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 "
'to make the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.')
self.offset_scale = offset_scale
self.channels = channels
self.kernel_size = kernel_size
self.dw_kernel_size = dw_kernel_size
self.stride = stride
self.dilation = dilation
self.pad = pad
self.group = group
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.remove_center = int(remove_center)
if self.remove_center and self.kernel_size % 2 == 0:
raise ValueError(
'remove_center is only compatible with odd kernel size.')
self.dw_conv = nn.Sequential(
nn.Conv2d(
channels,
channels,
kernel_size=dw_kernel_size,
stride=1,
padding=(dw_kernel_size - 1) // 2,
groups=channels),
build_norm_layer(channels, norm_layer, 'channels_first',
'channels_last'), build_act_layer(act_layer))
self.offset = nn.Linear(
channels,
group * (kernel_size * kernel_size - remove_center) * 2)
self.mask = nn.Linear(
channels, group * (kernel_size * kernel_size - remove_center))
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
self.center_feature_scale_proj_bias = nn.Parameter(
torch.tensor(0.0, dtype=torch.float).view(
(1, )).repeat(group, ))
self.center_feature_scale_module = CenterFeatureScaleModule()
def _reset_parameters(self):
constant_(self.offset.weight.data, 0.)
constant_(self.offset.bias.data, 0.)
constant_(self.mask.weight.data, 0.)
constant_(self.mask.bias.data, 0.)
xavier_uniform_(self.input_proj.weight.data)
constant_(self.input_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, input):
"""
        :param input: input tensor of shape (N, H, W, C)
        :return: output tensor of shape (N, H, W, C)
"""
N, H, W, _ = input.shape
x = self.input_proj(input)
x_proj = x
dtype = x.dtype
x1 = input.permute(0, 3, 1, 2)
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
mask = F.softmax(mask, -1)
mask = mask.reshape(N, H, W, -1).type(dtype)
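        # im2col_step is hard-coded to 256 here; the CUDA kernel clamps it
        # to the batch size and asserts that it divides the batch evenly.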
x = DCNv3Function.apply(x, offset, mask, self.kernel_size,
self.kernel_size, self.stride, self.stride,
self.pad, self.pad, self.dilation,
self.dilation, self.group, self.group_channels,
self.offset_scale, 256, self.remove_center)
if self.center_feature_scale:
center_feature_scale = self.center_feature_scale_module(
x1, self.center_feature_scale_proj_weight,
self.center_feature_scale_proj_bias)
# N, H, W, groups ->
# N, H, W, groups, 1 ->
# N, H, W, groups, _d_per_group ->
# N, H, W, channels
center_feature_scale = center_feature_scale[..., None].repeat(
1, 1, 1, 1, self.channels // self.group).flatten(-2)
x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
x = self.output_proj(x)
return x
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
import glob
import os
from setuptools import find_packages, setup
import torch
from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
requirements = ['torch', 'torchvision']
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, 'src')
main_file = glob.glob(os.path.join(extensions_dir, '*.cpp'))
source_cpu = glob.glob(os.path.join(extensions_dir, 'cpu', '*.cpp'))
source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {'cxx': []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [('WITH_CUDA', None)]
extra_compile_args['nvcc'] = [
# "-DCUDA_HAS_FP16=1",
# "-D__CUDA_NO_HALF_OPERATORS__",
# "-D__CUDA_NO_HALF_CONVERSIONS__",
# "-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
raise NotImplementedError('CUDA is not available')
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
'DCNv3',
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name='DCNv3',
version='1.1',
author='InternImage',
url='https://github.com/OpenGVLab/InternImage',
description='PyTorch Wrapper for CUDA Functions of DCNv3',
packages=find_packages(exclude=(
'configs',
'tests',
)),
ext_modules=get_extensions(),
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const int im2col_step) {
AT_ERROR("Not implement on cpu");
}
std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step) {
AT_ERROR("Not implement on cpu");
}
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const int im2col_step);
std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step);
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "cuda/dcnv3_im2col_cuda.cuh"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels,
const float offset_scale, const int im2col_step, const int remove_center) {
AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
const int batch = input.size(0);
const int height_in = input.size(1);
const int width_in = input.size(2);
const int channels = input.size(3);
const int height_out =
(height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1;
const int width_out =
(width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1;
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0,
"batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
AT_ASSERTM(
channels == (group * group_channels),
"Input channels and group times group channels won't match: (%d vs %d).",
channels, group * group_channels);
auto output =
at::zeros({batch, height_out, width_out, group * group_channels},
input.options());
const int batch_n = im2col_step_;
auto output_n = output.view({batch / batch_n, batch_n, height_out,
width_out, group * group_channels});
auto per_input_size = height_in * width_in * group * group_channels;
auto per_offset_size =
height_out * width_out * group * (kernel_h * kernel_w - remove_center) * 2;
auto per_mask_size = height_out * width_out * group * (kernel_h * kernel_w - remove_center);
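    // Launch the im2col kernel over the batch in chunks of im2col_step_
    // samples, writing each chunk directly into its slice of the output.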
for (int n = 0; n < batch / im2col_step_; ++n) {
auto columns = output_n.select(0, n);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.type(), "ms_deform_attn_forward_cuda", ([&] {
dcnv3_im2col_cuda(
at::cuda::getCurrentCUDAStream(),
input.data<scalar_t>() + n * im2col_step_ * per_input_size,
offset.data<scalar_t>() +
n * im2col_step_ * per_offset_size,
mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, batch_n, height_in, width_in, height_out,
width_out, offset_scale, remove_center);
}));
}
return output;
}
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step, const int remove_center) {
AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(),
"grad_output tensor has to be contiguous");
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(),
"grad_output must be a CUDA tensor");
const int batch = input.size(0);
const int height_in = input.size(1);
const int width_in = input.size(2);
const int channels = input.size(3);
const int height_out =
(height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1;
const int width_out =
(width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1;
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0,
"batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
AT_ASSERTM(
channels == (group * group_channels),
"Input channels and group times group channels won't match: (%d vs %d).",
channels, group * group_channels);
auto dtype = input.dtype();
if (dtype == at::kHalf) {
dtype = at::kFloat;
}
auto grad_input = at::zeros_like(input, dtype);
auto grad_offset = at::zeros_like(offset, dtype);
auto grad_mask = at::zeros_like(mask, dtype);
const int batch_n = im2col_step_;
auto per_input_size = height_in * width_in * group * group_channels;
auto per_offset_size =
height_out * width_out * group * (kernel_h * kernel_w - remove_center) * 2;
auto per_mask_size = height_out * width_out * group * (kernel_h * kernel_w - remove_center);
auto grad_output_n =
grad_output.view({batch / im2col_step_, batch_n, height_out * width_out,
group, group_channels});
for (int n = 0; n < batch / im2col_step_; ++n) {
auto grad_output_g = grad_output_n.select(0, n);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.type(), "ms_deform_attn_backward_cuda", ([&] {
dcnv3_col2im_cuda(
at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
input.data<scalar_t>() + n * im2col_step_ * per_input_size,
offset.data<scalar_t>() +
n * im2col_step_ * per_offset_size,
mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, group_channels, batch_n,
height_in, width_in, height_out, width_out, offset_scale, remove_center,
grad_input.data<opmath_t>() +
n * im2col_step_ * per_input_size,
grad_offset.data<opmath_t>() +
n * im2col_step_ * per_offset_size,
grad_mask.data<opmath_t>() +
n * im2col_step_ * per_mask_size);
}));
}
if (input.dtype() == torch::kHalf) {
return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
grad_mask.to(torch::kHalf)};
} else {
return {grad_input, grad_offset, grad_mask};
}
}
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels,
const float offset_scale, const int im2col_step, const int remove_center);
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step, const int remove_center);
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/dcnv3_cpu.h"
#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif
at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w,
const int group, const int group_channels,
const float offset_scale, const int im2col_step, const int remove_center) {
if (input.type().is_cuda()) {
#ifdef WITH_CUDA
return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, group_channels,
offset_scale, im2col_step, remove_center);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels,
const float offset_scale, const at::Tensor &grad_output,
const int im2col_step, const int remove_center) {
if (input.type().is_cuda()) {
#ifdef WITH_CUDA
return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, group_channels,
offset_scale, grad_output, im2col_step, remove_center);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "dcnv3.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward");
m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward");
}
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Copied from
# https://github.com/OpenGVLab/InternImage/blob/master/classification/models/
from __future__ import absolute_import, division, print_function
import math # noqa
import time
import torch
import torch.nn as nn # noqa
from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch
from torch.autograd import gradcheck # noqa
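# Test-case constants: N batch size, M deformable groups, D channels per
# group, (Kh, Kw) kernel size, P sampling points per group.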
H_in, W_in = 8, 8
N, M, D = 2, 4, 16
Kh, Kw = 3, 3
remove_center = False
P = Kh * Kw - remove_center
offset_scale = 2.0
pad = 1
dilation = 1
stride = 1
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
torch.manual_seed(3)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
input = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask /= mask.sum(-1, keepdim=True)
mask = mask.reshape(N, H_out, W_out, M * P)
output_pytorch = dcnv3_core_pytorch(input.double(), offset.double(),
mask.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation,
M, D, offset_scale,
remove_center).detach().cpu()
im2col_step = 2
output_cuda = DCNv3Function.apply(input.double(), offset.double(),
mask.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation, M,
D, offset_scale, im2col_step,
remove_center).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() /
output_pytorch.abs()).max()
print('>>> forward double')
print(f'* {fwdok} check_forward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
input = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask /= mask.sum(-1, keepdim=True)
mask = mask.reshape(N, H_out, W_out, M * P)
output_pytorch = dcnv3_core_pytorch(input, offset, mask, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
remove_center).detach().cpu()
im2col_step = 2
output_cuda = DCNv3Function.apply(input, offset, mask, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
im2col_step,
remove_center).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() /
output_pytorch.abs()).max()
print('>>> forward float')
print(f'* {fwdok} check_forward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_backward_equal_with_pytorch_double(channels=4,
grad_input=True,
grad_offset=True,
grad_mask=True):
# H_in, W_in = 4, 4
N = 2
M = 2
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
D = channels
input0 = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset0 = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask0 /= mask0.sum(-1, keepdim=True)
mask0 = mask0.reshape(N, H_out, W_out, M * P)
input0.requires_grad = grad_input
offset0.requires_grad = grad_offset
mask0.requires_grad = grad_mask
output_pytorch = dcnv3_core_pytorch(input0.double(), offset0.double(),
mask0.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation,
M, D, offset_scale, remove_center)
output_pytorch.sum().backward()
input1 = input0.detach()
offset1 = offset0.detach()
mask1 = mask0.detach()
input1.requires_grad = grad_input
offset1.requires_grad = grad_offset
mask1.requires_grad = grad_mask
im2col_step = 2
output_cuda = DCNv3Function.apply(input1.double(), offset1.double(),
mask1.double(), Kh, Kw, stride, stride,
Kh // 2, Kw // 2, dilation, dilation, M,
D, offset_scale, im2col_step,
remove_center)
output_cuda.sum().backward()
print(f'>>> backward double: channels {D}')
bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (input0.grad - input1.grad).abs().max()
max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max()
print(f'* {bwdok} input_grad check_backward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (offset0.grad - offset1.grad).abs().max()
max_rel_err = ((offset0.grad - offset1.grad).abs() /
offset0.grad.abs()).max()
print(f'* {bwdok} offset_grad check_backward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (mask0.grad - mask1.grad).abs().max()
max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max()
print(f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_backward_equal_with_pytorch_float(channels=4,
grad_input=True,
grad_offset=True,
grad_mask=True):
# H_in, W_in = 4, 4
N = 2
M = 2
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
D = channels
input0 = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset0 = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask0 /= mask0.sum(-1, keepdim=True)
mask0 = mask0.reshape(N, H_out, W_out, M * P)
input0.requires_grad = grad_input
offset0.requires_grad = grad_offset
mask0.requires_grad = grad_mask
output_pytorch = dcnv3_core_pytorch(input0, offset0, mask0, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
remove_center)
output_pytorch.sum().backward()
input1 = input0.detach()
offset1 = offset0.detach()
mask1 = mask0.detach()
input1.requires_grad = grad_input
offset1.requires_grad = grad_offset
mask1.requires_grad = grad_mask
im2col_step = 2
output_cuda = DCNv3Function.apply(input1, offset1, mask1, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, offset_scale,
im2col_step, remove_center)
output_cuda.sum().backward()
print(f'>>> backward float: channels {D}')
bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (input0.grad - input1.grad).abs().max()
max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max()
print(f'* {bwdok} input_grad check_backward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (offset0.grad - offset1.grad).abs().max()
max_rel_err = ((offset0.grad - offset1.grad).abs() /
offset0.grad.abs()).max()
print(f'* {bwdok} offset_grad check_backward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
max_abs_err = (mask0.grad - mask1.grad).abs().max()
max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max()
print(f'* {bwdok} mask_grad check_backward_equal_with_pytorch_float:'
f' max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_time_cost(im2col_step=128):
N = 512
H_in, W_in = 64, 64
H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
input = torch.rand(N, H_in, W_in, M * D).cuda() * 0.01
offset = torch.rand(N, H_out, W_out, M * P * 2).cuda() * 10
mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
mask /= mask.sum(-1, keepdim=True)
mask = mask.reshape(N, H_out, W_out, M * P)
print(f'>>> time cost: im2col_step {im2col_step};'
f' input {input.shape}; points {P}')
repeat = 100
for i in range(repeat):
output_cuda = DCNv3Function.apply(input, offset, mask, Kh, Kw, stride,
stride, Kh // 2, Kw // 2, dilation,
dilation, M, D, 1.0, im2col_step,
remove_center)
torch.cuda.synchronize()
start = time.time()
for i in range(repeat):
output_cuda = DCNv3Function.apply( # noqa
input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2,
dilation, dilation, M, D, 1.0, im2col_step, remove_center)
torch.cuda.synchronize()
print(f'forward time cost: {(time.time() - start) / repeat}')
if __name__ == '__main__':
check_forward_equal_with_pytorch_double()
check_forward_equal_with_pytorch_float()
for channels in [1, 16, 30, 32, 64, 71, 1025]:
check_backward_equal_with_pytorch_double(channels, True, True, True)
for channels in [1, 16, 30, 32, 64, 71, 1025]:
check_backward_equal_with_pytorch_float(channels, True, True, True)
for i in range(3):
im2col_step = 128 * (2**i)
check_time_cost(im2col_step)
# MaskFeat Pre-training with Video
- [MaskFeat Pre-training with Video](#maskfeat-pre-training-with-video)
- [Description](#description)
- [Usage](#usage)
- [Setup Environment](#setup-environment)
- [Data Preparation](#data-preparation)
- [Pre-training Commands](#pre-training-commands)
- [On Local Single GPU](#on-local-single-gpu)
- [On Multiple GPUs](#on-multiple-gpus)
- [On Multiple GPUs with Slurm](#on-multiple-gpus-with-slurm)
- [Downstream Tasks Commands](#downstream-tasks-commands)
- [On Multiple GPUs](#on-multiple-gpus-1)
- [On Multiple GPUs with Slurm](#on-multiple-gpus-with-slurm-1)
- [Results](#results)
- [Citation](#citation)
- [Checklist](#checklist)
## Description
<!-- Share any information you would like others to know. For example:
Author: @xxx.
This is an implementation of \[XXX\]. -->
Author: @fangyixiao18
This is an implementation of **MaskFeat** on video datasets such as Kinetics-400.
## Usage
<!-- For a typical model, this section should contain the commands for dataset preparation, pre-training, and downstream tasks. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
### Setup Environment
Requirements:
- MMPretrain >= 1.0.0rc0
- MMAction2 >= 1.0.0rc3
Please refer to the [Get Started](https://mmpretrain.readthedocs.io/en/latest/get_started.html) documentation of MMPretrain to complete the installation.
In addition, we use transforms from MMAction2 to process the video data. Instructions for installing MMAction2 can be found in its [Get Started documentation](https://mmaction2.readthedocs.io/en/1.x/get_started.html).
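For reference, a minimal installation sketch using [MIM](https://github.com/open-mmlab/mim); the version pins are illustrative, so follow the linked documentation for authoritative steps:
```bash
pip install -U openmim
mim install "mmpretrain>=1.0.0rc0"
mim install "mmaction2>=1.0.0rc3"
```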
### Data Preparation
You can refer to the [documentation](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) in MMAction2.
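For reference, each line of a `VideoDataset` annotation list pairs a video path (relative to the data root) with a label id; the entries below are hypothetical:
```text
abseiling/abc123.mp4 0
air_drumming/def456.mp4 1
```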
### Pre-training Commands
First, add the current folder to `PYTHONPATH` so that Python can find your model files. From the `projects/maskfeat_video/` root directory, run the command below to add it.
```shell
export PYTHONPATH=`pwd`:$PYTHONPATH
```
Then run the following commands to train the model:
#### On Local Single GPU
```bash
# train with mim
mim train mmpretrain ${CONFIG} --work-dir ${WORK_DIR}
# a specific command example
mim train mmpretrain configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400/
# train with scripts
python tools/train.py configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400/
```
#### On Multiple GPUs
```bash
# train with mim
# a specific command example with 8 GPUs
mim train mmpretrain configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400/ \
--launcher pytorch --gpus 8
# train with scripts
bash tools/dist_train.sh configs/maskfeat_mvit-small_8xb32-amp-coslr-300e_k400.py 8
```
Note:
- CONFIG: the config files under the directory `configs/`
- WORK_DIR: the working directory to save configs, logs, and checkpoints
#### On Multiple GPUs with Slurm
```bash
# train with mim
mim train mmpretrain configs/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/ \
--launcher slurm --gpus 16 --gpus-per-node 8 \
--partition ${PARTITION}
# train with scripts
GPUS_PER_NODE=8 GPUS=16 bash tools/slurm_train.sh ${PARTITION} maskfeat-video \
configs/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py \
--work-dir work_dirs/selfsup/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/
```
Note:
- CONFIG: the config files under the directory `configs/`
- WORK_DIR: the working directory to save configs, logs, and checkpoints
- PARTITION: the slurm partition you are using
### Downstream Tasks Commands
To evaluate the **MaskFeat MViT** pretrained with MMPretrain, we recommend running the downstream tasks with MMAction2:
#### On Multiple GPUs
```bash
# command example for train
mim train mmaction2 ${CONFIG} \
--work-dir ${WORK_DIR} \
--launcher pytorch --gpus 8 \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=${CHECKPOINT} \
model.backbone.init_cfg.prefix="backbone." \
${PY_ARGS}
# a specific command example
mim train mmaction2 configs/mvit-small_ft-8xb8-coslr-100e_k400.py \
--work-dir work_dirs/benchmarks/maskfeat/training_maskfeat-mvit-k400/ \
--launcher pytorch --gpus 8 \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400_20230131-87d60b6f.pth \
model.backbone.init_cfg.prefix="backbone." \
$PY_ARGS
# command example for test
mim test mmaction2 configs/mvit-small_ft-8xb16-coslr-100e_k400.py \
--checkpoint https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230131-5e8303f5.pth \
--work-dir work_dirs/benchmarks/maskfeat/maskfeat-mvit-k400/test/ \
--launcher pytorch --gpus 8
```
#### On Multiple GPUs with Slurm
```bash
mim train mmaction2 ${CONFIG} \
--work-dir ${WORK_DIR} \
--launcher slurm --gpus 8 --gpus-per-node 8 \
--partition ${PARTITION} \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=$CHECKPOINT \
model.backbone.init_cfg.prefix="backbone." \
$PY_ARGS
mim test mmaction2 ${CONFIG} \
--checkpoint https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230131-5e8303f5.pth \
--work-dir ${WORK_DIR} \
--launcher slurm --gpus 8 --gpus-per-node 8 \
--partition ${PARTITION} \
$PY_ARGS
```
Note:
- CONFIG: the config files under the directory `configs/`
- WORK_DIR: the working directory to save configs, logs, and checkpoints
- PARTITION: the slurm partition you are using
- CHECKPOINT: the pretrained checkpoint of MMPretrain saved in working directory, like `$WORK_DIR/epoch_300.pth`
- PY_ARGS: other optional args
## Results
<!-- You should claim whether this is based on the pre-trained weights, which are converted from the official release; or it's a reproduced result obtained from retraining the model in this project. -->
The fine-tuning results are based on the Kinetics-400 (K400) dataset.
Since several versions of K400 exist, our pre-training, fine-tuning, and final test results are based on the MMAction2 version, which differs slightly from the PySlowFast version.
<table class="docutils">
<thead>
<tr>
<th>Algorithm</th>
<th>Backbone</th>
<th>Epoch</th>
<th>Batch Size</th>
<th>Fine-tuning Top-1 (%)</th>
<th>Pretrain Links</th>
<th>Fine-tuning Links</th>
</tr>
</thead>
<tbody>
<tr>
<td>MaskFeat</td>
<td>MViT-small</td>
<td>300</td>
<td>512</td>
<td>81.8</td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/projects/maskfeat_video/configs/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400_20230131-87d60b6f.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400_20230118_114151.json'>log</a></td>
<td><a href='https://github.com/open-mmlab/mmselfsup/blob/dev-1.x/projects/maskfeat_video/configs/mvit-small_ft-8xb16-coslr-100e_k400.py'>config</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230131-5e8303f5.pth'>model</a> | <a href='https://download.openmmlab.com/mmselfsup/1.x/maskfeat/maskfeat_mvit-small_16xb32-amp-coslr-300e_k400/mvit-small_ft-8xb16-coslr-100e_k400/mvit-small_ft-8xb16-coslr-100e_k400_20230121_142927.json'>log</a></td>
</tr>
</tbody>
</table>
Remarks:
- We converted the pretrained model from PySlowFast and ran fine-tuning with MMAction2; on the MMAction2 version of K400 it reached `81.5` test accuracy, while the model pretrained with MMPretrain reached `81.8`, as provided above.
- We also tested our model on [another version](https://github.com/facebookresearch/video-nonlocal-net/blob/main/DATASET.md) of K400 and obtained `82.1` test accuracy.
- Some other details can be found in [MMAction2 MViT page](https://github.com/open-mmlab/mmaction2/tree/dev-1.x/configs/recognition/mvit).
## Citation
```bibtex
@InProceedings{wei2022masked,
author = {Wei, Chen and Fan, Haoqi and Xie, Saining and Wu, Chao-Yuan and Yuille, Alan and Feichtenhofer, Christoph},
title = {Masked Feature Prediction for Self-Supervised Visual Pre-Training},
booktitle = {CVPR},
year = {2022},
}
```
## Checklist
Here is a checklist illustrating a usual development workflow of a successful project, which also serves as an overview of this project's progress.
<!--The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR.
OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone.
Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed.
A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. -->
- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
- [x] Finish the code
<!-- The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `MMPretrain.registry.MODELS` and configurable via a config file. -->
- [x] Basic docstrings & proper citation
<!-- Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) -->
- [x] Inference correctness
<!-- If you are reproducing the result from a paper, make sure your model's inference-time feature vectors or losses matches that from the original codes. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. -->
- [x] A full README
<!-- As this template does. -->
- [x] Milestone 2: Indicates a successful model implementation.
- [x] Training-time correctness
<!-- If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result. Due to the pretrain-downstream pipeline of self-supervised learning, this item requires at least one downstream result matches the report within a minor error range. -->
- [ ] Milestone 3: Good to be a part of our core package!
- [ ] Type hints and docstrings
<!-- Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/selfsup/mae.py) -->
- [ ] Unit tests
<!-- Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmpretrain/blob/main/tests/test_models/test_selfsup/test_mae.py) -->
- [ ] Code polishing
<!-- Refactor your code according to reviewer's comment. -->
- [ ] `metafile.yml` and `README.md`
<!-- It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmpretrain/blob/main/configs/mae/metafile.yml). In particular, you may have to refactor this README into a standard one. [Example](https://github.com/open-mmlab/mmpretrain/blob/main/configs/mae/README.md) -->
- [ ] Refactor and Move your modules into the core package following the codebase's file hierarchy structure.
_base_ = 'mmpretrain::_base_/default_runtime.py'
custom_imports = dict(imports=['models'], allow_failed_imports=False)
model = dict(
type='VideoMaskFeat',
backbone=dict(
type='MaskFeatMViT',
arch='maskfeat-small',
drop_path_rate=0.0,
dim_mul_in_attention=False),
neck=dict(
type='LinearNeck',
in_channels=768,
out_channels=108,
with_avg_pool=False,
init_cfg=dict(type='TruncNormal', layer='Linear', std=0.02, bias=0)),
head=dict(
type='MaskFeatPretrainHead',
loss=dict(type='PixelReconstructionLoss', criterion='L2')),
target_generator=dict(
type='HOGGenerator3d', nbins=9, pool=8, gaussian_window=16))
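# Note: the neck's out_channels=108 matches the HOG target dimension of
# HOGGenerator3d: 9 bins x 3 color channels x (2 x 2) unfolded cells.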
# dataset settings
dataset_type = 'mmaction.VideoDataset'
data_root = 'data/kinetics400/videos_train'
ann_file_train = 'data/Kinetics400/kinetics400_train_list_videos.txt'
data_preprocessor = dict(
type='VideoDataPreprocessor',
mean=[114.75, 114.75, 114.75],
std=[57.375, 57.375, 57.375],
format_shape='NCTHW')
train_pipeline = [
dict(type='mmaction.DecordInit'),
dict(
type='mmaction.SampleFrames',
clip_len=16,
frame_interval=4,
num_clips=1),
dict(type='mmaction.DecordDecode'),
dict(type='mmaction.Resize', scale=(-1, 256)),
dict(type='mmaction.RandomResizedCrop', area_range=(0.5, 1.0)),
dict(type='mmaction.Resize', scale=(224, 224), keep_ratio=False),
dict(type='mmaction.Flip', flip_ratio=0.5),
dict(type='mmaction.FormatShape', input_format='NCTHW'),
dict(
type='MaskFeatMaskGenerator3D',
input_size=(8, 7, 7),
num_masking_patches=157,
min_num_patches=9,
max_num_patches=49),
dict(type='PackInputs', input_key='imgs')
]
train_dataloader = dict(
batch_size=32,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='default_collate'),
dataset=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=dict(video=data_root),
pipeline=train_pipeline))
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(
type='AdamW', lr=8e-4 * 2, betas=(0.9, 0.999), weight_decay=0.05),
clip_grad=dict(max_norm=0.02),
paramwise_cfg=dict(
bias_decay_mult=0.,
norm_decay_mult=0.,
custom_keys={
'pos_embed': dict(decay_mult=0.),
'cls_token': dict(decay_mult=0.)
}))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=290,
eta_min=1e-6,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
default_hooks = dict(
checkpoint=dict(interval=1, max_keep_ckpts=2), logger=dict(interval=100))
_base_ = './maskfeat_mvit-small_16xb32-amp-coslr-300e_k400.py'
optim_wrapper = dict(
optimizer=dict(
type='AdamW', lr=8e-4, betas=(0.9, 0.999), weight_decay=0.05))
_base_ = [
'mmaction::_base_/models/mvit_small.py',
'mmaction::_base_/default_runtime.py'
]
model = dict(
backbone=dict(
drop_path_rate=0.1,
dim_mul_in_attention=False,
pretrained=None,
pretrained_type='maskfeat',
),
data_preprocessor=dict(
type='ActionDataPreprocessor',
mean=[114.75, 114.75, 114.75],
std=[57.375, 57.375, 57.375],
blending=dict(
type='RandomBatchAugment',
augments=[
dict(type='MixupBlending', alpha=0.8, num_classes=400),
dict(type='CutmixBlending', alpha=1, num_classes=400)
]),
format_shape='NCTHW'),
cls_head=dict(dropout_ratio=0., init_scale=0.001))
# dataset settings
dataset_type = 'VideoDataset'
data_root = 'data/kinetics400/videos_train'
data_root_val = 'data/kinetics400/videos_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='PytorchVideoWrapper', op='RandAugment', magnitude=7),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='RandomErasing', erase_prob=0.25, mode='rand'),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=16,
frame_interval=4,
num_clips=1,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=16,
frame_interval=4,
num_clips=10,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 224)),
dict(type='CenterCrop', crop_size=224),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='PackActionInputs')
]
repeat_sample = 2
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
collate_fn=dict(type='repeat_pseudo_collate'),
dataset=dict(
type='RepeatAugDataset',
num_repeats=repeat_sample,
ann_file=ann_file_train,
data_prefix=dict(video=data_root),
pipeline=train_pipeline))
val_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=dict(video=data_root_val),
pipeline=val_pipeline,
test_mode=True))
test_dataloader = dict(
batch_size=1,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=dict(video=data_root_val),
pipeline=test_pipeline,
test_mode=True))
val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator
train_cfg = dict(
type='EpochBasedTrainLoop', max_epochs=100, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
base_lr = 9.6e-3
optim_wrapper = dict(
optimizer=dict(
type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
constructor='LearningRateDecayOptimizerConstructor',
paramwise_cfg={
'decay_rate': 0.75,
'decay_type': 'layer_wise',
'num_layers': 16
},
clip_grad=dict(max_norm=5, norm_type=2))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1 / 600,
by_epoch=True,
begin=0,
end=20,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=80,
eta_min_ratio=1 / 600,
by_epoch=True,
begin=20,
end=100,
convert_to_iter_based=True)
]
default_hooks = dict(
checkpoint=dict(interval=3, max_keep_ckpts=20), logger=dict(interval=100))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (64 samples per GPU) / repeat_sample.
auto_scale_lr = dict(enable=True, base_batch_size=512 // repeat_sample)
from .hog_generator_3d import HOGGenerator3d
from .maskfeat import VideoMaskFeat
from .maskfeat_mvit import MaskFeatMViT
from .transforms import MaskFeatMaskGenerator3D
__all__ = [
'HOGGenerator3d', 'VideoMaskFeat', 'MaskFeatMViT',
'MaskFeatMaskGenerator3D'
]
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmpretrain.models import HOGGenerator
from mmpretrain.registry import MODELS
@MODELS.register_module()
class HOGGenerator3d(HOGGenerator):
"""Generate HOG feature for videos.
This module is used in MaskFeat to generate HOG feature.
Here is the link of `HOG wikipedia
<https://en.wikipedia.org/wiki/Histogram_of_oriented_gradients>`_.
Args:
nbins (int): Number of bin. Defaults to 9.
pool (float): Number of cell. Defaults to 8.
gaussian_window (int): Size of gaussian kernel. Defaults to 16.
"""
def __init__(self,
nbins: int = 9,
pool: int = 8,
gaussian_window: int = 16) -> None:
super().__init__(
nbins=nbins, pool=pool, gaussian_window=gaussian_window)
def _reshape(self, hog_feat: torch.Tensor) -> torch.Tensor:
"""Reshape HOG Features for output."""
hog_feat = hog_feat.flatten(1, 2)
self.unfold_size = hog_feat.shape[-1] // 14
hog_feat = hog_feat.permute(0, 2, 3, 1)
hog_feat = hog_feat.unfold(1, self.unfold_size,
self.unfold_size).unfold(
2, self.unfold_size, self.unfold_size)
hog_feat = hog_feat.flatten(3).view(self.B, self.T, 14, 14, -1)
hog_feat = hog_feat.flatten(1, 3) # B N C
return hog_feat
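# A minimal shape-check sketch (illustrative only): it assumes the caller
# sets ``B`` and ``T`` before ``_reshape`` runs, as the pre-training model
# does, and a 224x224 input with the default pool=8 and nbins=9.
if __name__ == '__main__':
    gen = HOGGenerator3d()
    gen.B, gen.T = 2, 8  # 2 clips, 8 temporal patches each
    hog = torch.rand(gen.B * gen.T, 3, 9, 28, 28)  # per-frame HOG maps
    out = gen._reshape(hog)
    assert out.shape == (2, 8 * 14 * 14, 108)  # (B, N, C), C = 9 * 3 * 2 * 2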