Commit df3c64a9 authored by zhiqi-li

support occupancy prediction

parent bdd98bcb
from .dense_heads import *
from .detectors import *
from .modules import *
from .runner import *
from .hooks import *
from .backbones import *
from .train import custom_train_model
from .mmdet_train import custom_train_detector
# from .test import custom_multi_gpu_test
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings
import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
Fp16OptimizerHook, OptimizerHook, build_optimizer,
build_runner, get_dist_info)
from mmcv.utils import build_from_cfg
from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset,
replace_ImageToTensor)
from mmdet.utils import get_root_logger
import time
import os.path as osp
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
from projects.mmdet3d_plugin.datasets import custom_build_dataset
def custom_train_detector(model,
dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
eval_model=None,
meta=None):
logger = get_root_logger(cfg.log_level)
# prepare data loaders
dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
# assert len(dataset) == 1
if 'imgs_per_gpu' in cfg.data:
logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
'Please use "samples_per_gpu" instead')
if 'samples_per_gpu' in cfg.data:
logger.warning(
f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
f'={cfg.data.imgs_per_gpu} is used in this experiments')
else:
logger.warning(
'Automatically set "samples_per_gpu"="imgs_per_gpu"='
f'{cfg.data.imgs_per_gpu} in this experiment')
cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
data_loaders = [
build_dataloader(
ds,
cfg.data.samples_per_gpu,
cfg.data.workers_per_gpu,
# cfg.gpus will be ignored if distributed
len(cfg.gpu_ids),
dist=distributed,
seed=cfg.seed,
shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
) for ds in dataset
]
# put model on gpus
if distributed:
find_unused_parameters = cfg.get('find_unused_parameters', False)
# Sets the `find_unused_parameters` parameter in
# torch.nn.parallel.DistributedDataParallel
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
if eval_model is not None:
eval_model = MMDistributedDataParallel(
eval_model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
else:
model = MMDataParallel(
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
if eval_model is not None:
eval_model = MMDataParallel(
eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
# build runner
optimizer = build_optimizer(model, cfg.optimizer)
if 'runner' not in cfg:
cfg.runner = {
'type': 'EpochBasedRunner',
'max_epochs': cfg.total_epochs
}
warnings.warn(
'config is now expected to have a `runner` section, '
'please set `runner` in your config.', UserWarning)
else:
if 'total_epochs' in cfg:
assert cfg.total_epochs == cfg.runner.max_epochs
if eval_model is not None:
runner = build_runner(
cfg.runner,
default_args=dict(
model=model,
eval_model=eval_model,
optimizer=optimizer,
work_dir=cfg.work_dir,
logger=logger,
meta=meta))
else:
runner = build_runner(
cfg.runner,
default_args=dict(
model=model,
optimizer=optimizer,
work_dir=cfg.work_dir,
logger=logger,
meta=meta))
# an ugly workaround to make .log and .log.json filenames the same
runner.timestamp = timestamp
# fp16 setting
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
optimizer_config = Fp16OptimizerHook(
**cfg.optimizer_config, **fp16_cfg, distributed=distributed)
elif distributed and 'type' not in cfg.optimizer_config:
optimizer_config = OptimizerHook(**cfg.optimizer_config)
else:
optimizer_config = cfg.optimizer_config
# register hooks
runner.register_training_hooks(cfg.lr_config, optimizer_config,
cfg.checkpoint_config, cfg.log_config,
cfg.get('momentum_config', None))
# register profiler hook
#trace_config = dict(type='tb_trace', dir_name='work_dir')
#profiler_config = dict(on_trace_ready=trace_config)
#runner.register_profiler_hook(profiler_config)
if distributed:
if isinstance(runner, EpochBasedRunner):
runner.register_hook(DistSamplerSeedHook())
# register eval hooks
if validate:
# Support batch_size > 1 in validation
val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
if val_samples_per_gpu > 1:
assert False, 'validation with samples_per_gpu > 1 is not supported in this version'
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.val.pipeline = replace_ImageToTensor(
cfg.data.val.pipeline)
val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
val_dataloader = build_dataloader(
val_dataset,
samples_per_gpu=val_samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False,
shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
)
eval_cfg = cfg.get('evaluation', {})
eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_'))
eval_hook = CustomDistEvalHook if distributed else EvalHook
runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
# user-defined hooks
if cfg.get('custom_hooks', None):
custom_hooks = cfg.custom_hooks
assert isinstance(custom_hooks, list), \
f'custom_hooks expect list type, but got {type(custom_hooks)}'
for hook_cfg in cfg.custom_hooks:
assert isinstance(hook_cfg, dict), \
'Each item in custom_hooks expects dict type, but got ' \
f'{type(hook_cfg)}'
hook_cfg = hook_cfg.copy()
priority = hook_cfg.pop('priority', 'NORMAL')
hook = build_from_cfg(hook_cfg, HOOKS)
runner.register_hook(hook, priority=priority)
if cfg.resume_from:
runner.resume(cfg.resume_from)
elif cfg.load_from:
runner.load_checkpoint(cfg.load_from)
runner.run(data_loaders, cfg.workflow)
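# Hedged usage sketch (illustrative only; `build_model`, `args`, and the config
# fields below come from a typical tools/train.py-style entry point and are not
# defined in this file):
#
#   model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'),
#                       test_cfg=cfg.get('test_cfg'))
#   datasets = [build_dataset(cfg.data.train)]
#   custom_train_detector(model, datasets, cfg, distributed=distributed,
#                         validate=(not args.no_validate),
#                         timestamp=timestamp, meta=meta)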
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Xiaoyu Tian
# ---------------------------------------------
import os.path as osp
import pickle
import shutil
import tempfile
import time
import mmcv
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results
import mmcv
import numpy as np
import pycocotools.mask as mask_util
def custom_encode_mask_results(mask_results):
"""Encode bitmap mask to RLE code. Semantic Masks only
Args:
mask_results (list | tuple[list]): bitmap mask results.
In mask scoring rcnn, mask_results is a tuple of (segm_results,
segm_cls_score).
Returns:
list | tuple: RLE encoded mask.
"""
cls_segms = mask_results
num_classes = len(cls_segms)
encoded_mask_results = []
for i in range(len(cls_segms)):
encoded_mask_results.append(
mask_util.encode(
np.array(
cls_segms[i][:, :, np.newaxis], order='F',
dtype='uint8'))[0]) # encoded with RLE
return [encoded_mask_results]
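# Hedged, illustrative example of the RLE encoding used above: a single (H, W)
# binary mask becomes one pycocotools RLE dict with 'size' and 'counts' keys.
#
#   m = np.zeros((4, 4), dtype=np.uint8)
#   m[1:3, 1:3] = 1
#   rle = mask_util.encode(np.array(m[:, :, np.newaxis], order='F', dtype='uint8'))[0]
#   # rle['size'] == [4, 4]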
def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
"""Test model with multiple gpus.
This method tests model with multiple gpus and collects the results
under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
it encodes results to gpu tensors and use gpu communication for results
collection. On cpu mode it saves the results on different gpus to 'tmpdir'
and collects them by the rank 0 worker.
Args:
model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader.
tmpdir (str): Path of directory to save the temporary results from
different gpus under cpu mode.
gpu_collect (bool): Option to use either gpu or cpu to collect results.
Returns:
list: The prediction results.
"""
model.eval()
bbox_results = []
mask_results = []
occ_results = []
dataset = data_loader.dataset
rank, world_size = get_dist_info()
if rank == 0:
prog_bar = mmcv.ProgressBar(len(dataset))
time.sleep(2) # This line can prevent deadlock problem in some cases.
have_mask = False
for i, data in enumerate(data_loader):
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
bs=result.shape[0]
assert bs==1, \
'Evaluation only supports batch_size=1 in this version'
# encode mask results
if isinstance(result, dict):
if 'bbox_results' in result.keys():
bbox_result = result['bbox_results']
batch_size = len(result['bbox_results'])
bbox_results.extend(bbox_result)
if 'mask_results' in result.keys() and result['mask_results'] is not None:
mask_result = custom_encode_mask_results(result['mask_results'])
mask_results.extend(mask_result)
have_mask = True
else:
batch_size = 1
occ_results.extend([result.squeeze(dim=0).cpu().numpy().astype(np.uint8)])
# batch_size = len(result)
# bbox_results.extend(result)
#if isinstance(result[0], tuple):
# assert False, 'this code is for instance segmentation, which our code will not utilize.'
# result = [(bbox_results, encode_mask_results(mask_results))
# for bbox_results, mask_results in result]
if rank == 0:
for _ in range(batch_size * world_size):
prog_bar.update()
# collect results from all ranks
if gpu_collect:
bbox_results = collect_results_gpu(bbox_results, len(dataset))
if have_mask:
mask_results = collect_results_gpu(mask_results, len(dataset))
else:
mask_results = None
else:
# bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
# tmpdir = tmpdir+'_mask' if tmpdir is not None else None
# if have_mask:
# mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
# else:
# mask_results = None
tmpdir = tmpdir + '_occ' if tmpdir is not None else None
occ_results = collect_results_cpu(occ_results, len(dataset), tmpdir)
return occ_results
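# Hedged usage sketch (illustrative; `model` and `data_loader` are assumed to be
# an MMDistributedDataParallel-wrapped model and a distributed dataloader built
# elsewhere):
#
#   occ_results = custom_multi_gpu_test(model, data_loader,
#                                       tmpdir='./tmp_eval', gpu_collect=False)
#   # only rank 0 receives the gathered list; other ranks get None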
def collect_results_cpu(result_part, size, tmpdir=None):
rank, world_size = get_dist_info()
# create a tmp dir if it is not specified
if tmpdir is None:
MAX_LEN = 512
# 32 is whitespace
dir_tensor = torch.full((MAX_LEN, ),
32,
dtype=torch.uint8,
device='cuda')
if rank == 0:
mmcv.mkdir_or_exist('.dist_test')
tmpdir = tempfile.mkdtemp(dir='.dist_test')
tmpdir = torch.tensor(
bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
dir_tensor[:len(tmpdir)] = tmpdir
dist.broadcast(dir_tensor, 0)
tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
else:
mmcv.mkdir_or_exist(tmpdir)
# dump the part result to the dir
mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
dist.barrier()
# collect all parts
if rank != 0:
return None
else:
# load results of all parts from tmp dir
part_list = []
for i in range(world_size):
part_file = osp.join(tmpdir, f'part_{i}.pkl')
part_list.append(mmcv.load(part_file))
# sort the results
ordered_results = []
# Because we changed the sampler at evaluation time so that each GPU handles a
# contiguous chunk of samples, the per-rank results are concatenated in rank
# order instead of being interleaved with zip(*part_list).
for res in part_list:
ordered_results.extend(list(res))
# the dataloader may pad some samples
ordered_results = ordered_results[:size]
# remove tmp dir
shutil.rmtree(tmpdir)
return ordered_results
def single_gpu_test(model,
data_loader,
show=False,
out_dir=None,
show_score_thr=0.3):
"""Test model with single gpu.
This method tests model with single gpu and gives the 'show' option.
By setting ``show=True``, it saves the visualization results under
``out_dir``.
Args:
model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader.
show (bool): Whether to save visualization results.
Default: False.
out_dir (str): The path to save visualization results.
Default: None.
Returns:
list[dict]: The prediction results.
"""
model.eval()
results = []
dataset = data_loader.dataset
prog_bar = mmcv.ProgressBar(len(dataset))
for i, data in enumerate(data_loader):
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
results.extend([result.squeeze(dim=0).cpu().numpy().astype(np.uint8)])
batch_size = len(result)
for _ in range(batch_size):
prog_bar.update()
return results
def collect_results_gpu(result_part, size):
return collect_results_cpu(result_part, size)
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .mmdet_train import custom_train_detector
from mmseg.apis import train_segmentor
from mmdet.apis import train_detector
def custom_train_model(model,
dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
eval_model=None,
meta=None):
"""A function wrapper for launching model training according to cfg.
Because we need a different eval_hook in the runner. Should be deprecated in
the future.
"""
if cfg.model.type in ['EncoderDecoder3D']:
assert False, 'training EncoderDecoder3D segmentors is not supported here'
else:
custom_train_detector(
model,
dataset,
cfg,
distributed=distributed,
validate=validate,
timestamp=timestamp,
eval_model=eval_model,
meta=meta)
def train_model(model,
dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
meta=None):
"""A function wrapper for launching model training according to cfg.
Because we need a different eval_hook in the runner. Should be deprecated in
the future.
"""
if cfg.model.type in ['EncoderDecoder3D']:
train_segmentor(
model,
dataset,
cfg,
distributed=distributed,
validate=validate,
timestamp=timestamp,
meta=meta)
else:
train_detector(
model,
dataset,
cfg,
distributed=distributed,
validate=validate,
timestamp=timestamp,
meta=meta)
from .internimage import InternImage
from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
"""
Mostly copy-paste from BEiT library:
https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py
"""
import json
from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
from mmcv.runner import get_dist_info
from mmdet.utils import get_root_logger
def get_num_layer_for_swin(var_name, num_max_layer, depths):
if var_name.startswith("backbone.patch_embed"):
return 0
elif "level_embeds" in var_name:
return 0
elif var_name.startswith("backbone.layers") or var_name.startswith(
"backbone.levels"):
if var_name.split('.')[3] not in ['downsample', 'norm']:
stage_id = int(var_name.split('.')[2])
layer_id = int(var_name.split('.')[4])
# layers for Swin-Large: [2, 2, 18, 2]
if stage_id == 0:
return layer_id + 1
elif stage_id == 1:
return layer_id + 1 + depths[0]
elif stage_id == 2:
return layer_id + 1 + depths[0] + depths[1]
else:
return layer_id + 1 + depths[0] + depths[1] + depths[2]
else:
stage_id = int(var_name.split('.')[2])
if stage_id == 0:
return 1 + depths[0]
elif stage_id == 1:
return 1 + depths[0] + depths[1]
elif stage_id == 2:
return 1 + depths[0] + depths[1] + depths[2]
else:
return 1 + depths[0] + depths[1] + depths[2]
else:
return num_max_layer - 1
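# Hedged, illustrative mapping (assuming Swin-Large style depths [2, 2, 18, 2]
# and num_max_layer = sum(depths) + 2):
#   "backbone.patch_embed.proj.weight"        -> 0
#   "backbone.levels.2.blocks.5.norm1.weight" -> 5 + 1 + 2 + 2 = 10
#   "bbox_head.cls_branches.0.weight"         -> num_max_layer - 1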
@OPTIMIZER_BUILDERS.register_module()
class CustomLayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
def add_params(self, params, module, prefix='', is_dcn_module=None):
"""Add all parameters of module to the params list.
The parameters of the given module will be added to the list of param
groups, with specific rules defined by paramwise_cfg.
Args:
params (list[dict]): A list of param groups, it will be modified
in place.
module (nn.Module): The module to be added.
prefix (str): The prefix of the module
is_dcn_module (int|float|None): If the current module is a
submodule of DCN, `is_dcn_module` will be passed to
control conv_offset layer's learning rate. Defaults to None.
"""
parameter_groups = {}
logger = get_root_logger()
logger.info(self.paramwise_cfg)
backbone_small_lr = self.paramwise_cfg.get('backbone_small_lr', False)
dino_head = self.paramwise_cfg.get('dino_head', False)
num_layers = self.paramwise_cfg.get('num_layers') + 2
layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
depths = self.paramwise_cfg.get('depths')
offset_lr_scale = self.paramwise_cfg.get('offset_lr_scale', 1.0)
logger.info("Build CustomLayerDecayOptimizerConstructor %f - %d" %
(layer_decay_rate, num_layers))
weight_decay = self.base_wd
for name, param in module.named_parameters():
if not param.requires_grad:
continue # frozen weights
if len(param.shape) == 1 or name.endswith(".bias") or \
"relative_position" in name or \
"norm" in name or\
"sampling_offsets" in name:
group_name = "no_decay"
this_weight_decay = 0.
else:
group_name = "decay"
this_weight_decay = weight_decay
layer_id = get_num_layer_for_swin(name, num_layers, depths)
if layer_id == num_layers - 1 and dino_head and \
("sampling_offsets" in name or "reference_points" in name):
group_name = "layer_%d_%s_0.1x" % (layer_id, group_name)
elif "sampling_offsets" in name or "reference_points" in name:
group_name = "layer_%d_%s_offset_lr_scale" % (layer_id,
group_name)
else:
group_name = "layer_%d_%s" % (layer_id, group_name)
if group_name not in parameter_groups:
scale = layer_decay_rate ** (num_layers - layer_id - 1)
if scale < 1 and backbone_small_lr == True:
scale = scale * 0.1
if "0.1x" in group_name:
scale = scale * 0.1
if "offset_lr_scale" in group_name:
scale = scale * offset_lr_scale
parameter_groups[group_name] = {
"weight_decay": this_weight_decay,
"params": [],
"param_names": [],
"lr_scale": scale,
"group_name": group_name,
"lr": scale * self.base_lr,
}
parameter_groups[group_name]["params"].append(param)
parameter_groups[group_name]["param_names"].append(name)
rank, _ = get_dist_info()
if rank == 0:
to_display = {}
for key in parameter_groups:
to_display[key] = {
"param_names": parameter_groups[key]["param_names"],
"lr_scale": parameter_groups[key]["lr_scale"],
"lr": parameter_groups[key]["lr"],
"weight_decay": parameter_groups[key]["weight_decay"],
}
logger.info("Param groups = %s" % json.dumps(to_display, indent=2))
# state_dict = module.state_dict()
# for group_name in parameter_groups:
# group = parameter_groups[group_name]
# for name in group["param_names"]:
# group["params"].append(state_dict[name])
params.extend(parameter_groups.values())
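# Hedged usage sketch (the values below are illustrative, not a checked-in
# config): this constructor is selected from the optimizer config that mmcv's
# build_optimizer consumes, e.g.
#
#   optimizer = dict(
#       type='AdamW', lr=2e-4, weight_decay=0.05,
#       constructor='CustomLayerDecayOptimizerConstructor',
#       paramwise_cfg=dict(num_layers=30, layer_decay_rate=0.9,
#                          depths=[4, 4, 18, 4], offset_lr_scale=1.0))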
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torch.nn as nn
from collections import OrderedDict
import torch.utils.checkpoint as checkpoint
from timm.models.layers import trunc_normal_, DropPath
from mmcv.runner import _load_checkpoint
from mmcv.cnn import constant_init, trunc_normal_init
from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES
import torch.nn.functional as F
from .ops_dcnv3 import modules as opsm
class to_channels_first(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 3, 1, 2)
class to_channels_last(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 2, 3, 1)
def build_norm_layer(dim,
norm_layer,
in_format='channels_last',
out_format='channels_last',
eps=1e-6):
layers = []
if norm_layer == 'BN':
if in_format == 'channels_last':
layers.append(to_channels_first())
layers.append(nn.BatchNorm2d(dim))
if out_format == 'channels_last':
layers.append(to_channels_last())
elif norm_layer == 'LN':
if in_format == 'channels_first':
layers.append(to_channels_last())
layers.append(nn.LayerNorm(dim, eps=eps))
if out_format == 'channels_first':
layers.append(to_channels_first())
else:
raise NotImplementedError(
f'build_norm_layer does not support {norm_layer}')
return nn.Sequential(*layers)
def build_act_layer(act_layer):
if act_layer == 'ReLU':
return nn.ReLU(inplace=True)
elif act_layer == 'SiLU':
return nn.SiLU(inplace=True)
elif act_layer == 'GELU':
return nn.GELU()
raise NotImplementedError(f'build_act_layer does not support {act_layer}')
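# Hedged usage sketch (illustrative): the helpers above compose a norm layer
# with optional layout permutes, e.g. a LayerNorm that accepts channels-first
# input and returns channels-first output:
#
#   norm = build_norm_layer(64, 'LN', 'channels_first', 'channels_first')
#   y = norm(torch.randn(2, 64, 32, 32))   # -> shape (2, 64, 32, 32)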
class CrossAttention(nn.Module):
r""" Cross Attention Module
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads. Default: 8
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
Default: False.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
attn_drop (float, optional): Dropout ratio of attention weight.
Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
attn_head_dim (int, optional): Dimension of attention head.
out_dim (int, optional): Dimension of output.
"""
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
attn_head_dim=None,
out_dim=None):
super().__init__()
if out_dim is None:
out_dim = dim
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim ** -0.5
assert all_head_dim == dim
self.q = nn.Linear(dim, all_head_dim, bias=False)
self.k = nn.Linear(dim, all_head_dim, bias=False)
self.v = nn.Linear(dim, all_head_dim, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.k_bias = None
self.v_bias = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, out_dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, k=None, v=None):
B, N, C = x.shape
N_k = k.shape[1]
N_v = v.shape[1]
q_bias, k_bias, v_bias = None, None, None
if self.q_bias is not None:
q_bias = self.q_bias
k_bias = self.k_bias
v_bias = self.v_bias
q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
q = q.reshape(B, N, 1, self.num_heads,
-1).permute(2, 0, 3, 1,
4).squeeze(0) # (B, N_head, N_q, dim)
k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1,
4).squeeze(0)
v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1,
4).squeeze(0)
q = q * self.scale
attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
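# Hedged shape sketch (illustrative): with dim=256 and num_heads=8, a query of
# shape (B, N_q, 256) attends over keys/values of shape (B, N_kv, 256) and the
# module returns (B, N_q, out_dim), where out_dim defaults to dim.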
class AttentiveBlock(nn.Module):
r"""Attentive Block
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads. Default: 8
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
Default: False.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
drop (float, optional): Dropout rate. Default: 0.0.
attn_drop (float, optional): Attention dropout rate. Default: 0.0.
drop_path (float | tuple[float], optional): Stochastic depth rate.
Default: 0.0.
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm.
attn_head_dim (int, optional): Dimension of attention head. Default: None.
out_dim (int, optional): Dimension of output. Default: None.
"""
def __init__(self,
dim,
num_heads,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer="LN",
attn_head_dim=None,
out_dim=None):
super().__init__()
self.norm1_q = build_norm_layer(dim, norm_layer, eps=1e-6)
self.norm1_k = build_norm_layer(dim, norm_layer, eps=1e-6)
self.norm1_v = build_norm_layer(dim, norm_layer, eps=1e-6)
self.cross_dcn = CrossAttention(dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
attn_head_dim=attn_head_dim,
out_dim=out_dim)
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
def forward(self,
x_q,
x_kv,
pos_q,
pos_k,
bool_masked_pos,
rel_pos_bias=None):
x_q = self.norm1_q(x_q + pos_q)
x_k = self.norm1_k(x_kv + pos_k)
x_v = self.norm1_v(x_kv)
x = self.cross_dcn(x_q, k=x_k, v=x_v)
return x
class AttentionPoolingBlock(AttentiveBlock):
def forward(self, x):
x_q = x.mean(1, keepdim=True)
x_kv = x
pos_q, pos_k = 0, 0
x = super().forward(x_q, x_kv, pos_q, pos_k,
bool_masked_pos=None,
rel_pos_bias=None)
x = x.squeeze(1)
return x
class StemLayer(nn.Module):
r""" Stem layer of InternImage
Args:
in_chans (int): number of input channels
out_chans (int): number of output channels
act_layer (str): activation layer
norm_layer (str): normalization layer
"""
def __init__(self,
in_chans=3,
out_chans=96,
act_layer='GELU',
norm_layer='BN'):
super().__init__()
self.conv1 = nn.Conv2d(in_chans,
out_chans // 2,
kernel_size=3,
stride=2,
padding=1)
self.norm1 = build_norm_layer(out_chans // 2, norm_layer,
'channels_first', 'channels_first')
self.act = build_act_layer(act_layer)
self.conv2 = nn.Conv2d(out_chans // 2,
out_chans,
kernel_size=3,
stride=2,
padding=1)
self.norm2 = build_norm_layer(out_chans, norm_layer, 'channels_first',
'channels_last')
def forward(self, x):
x = self.conv1(x)
x = self.norm1(x)
x = self.act(x)
x = self.conv2(x)
x = self.norm2(x)
return x
class DownsampleLayer(nn.Module):
r""" Downsample layer of InternImage
Args:
channels (int): number of input channels
norm_layer (str): normalization layer
"""
def __init__(self, channels, norm_layer='LN'):
super().__init__()
self.conv = nn.Conv2d(channels,
2 * channels,
kernel_size=3,
stride=2,
padding=1,
bias=False)
self.norm = build_norm_layer(2 * channels, norm_layer,
'channels_first', 'channels_last')
def forward(self, x):
x = self.conv(x.permute(0, 3, 1, 2))
x = self.norm(x)
return x
class MLPLayer(nn.Module):
r""" MLP layer of InternImage
Args:
in_features (int): number of input features
hidden_features (int): number of hidden features
out_features (int): number of output features
act_layer (str): activation layer
drop (float): dropout rate
"""
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer='GELU',
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = build_act_layer(act_layer)
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class InternImageLayer(nn.Module):
r""" Basic layer of InternImage
Args:
core_op (nn.Module): core operation of InternImage
channels (int): number of input channels
groups (list): Groups of each block.
mlp_ratio (float): ratio of mlp hidden features to input channels
drop (float): dropout rate
drop_path (float): drop path rate
act_layer (str): activation layer
norm_layer (str): normalization layer
post_norm (bool): whether to use post normalization
layer_scale (float): layer scale
offset_scale (float): offset scale
with_cp (bool): whether to use checkpoint
"""
def __init__(self,
core_op,
channels,
groups,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
act_layer='GELU',
norm_layer='LN',
post_norm=False,
layer_scale=None,
offset_scale=1.0,
with_cp=False,
dw_kernel_size=None, # for InternImage-H/G
res_post_norm=False, # for InternImage-H/G
center_feature_scale=False): # for InternImage-H/G
super().__init__()
self.channels = channels
self.groups = groups
self.mlp_ratio = mlp_ratio
self.with_cp = with_cp
self.norm1 = build_norm_layer(channels, 'LN')
self.post_norm = post_norm
self.dcn = core_op(
channels=channels,
kernel_size=3,
stride=1,
pad=1,
dilation=1,
group=groups,
offset_scale=offset_scale,
act_layer=act_layer,
norm_layer=norm_layer,
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
center_feature_scale=center_feature_scale) # for InternImage-H/G
self.drop_path = DropPath(drop_path) if drop_path > 0. \
else nn.Identity()
self.norm2 = build_norm_layer(channels, 'LN')
self.mlp = MLPLayer(in_features=channels,
hidden_features=int(channels * mlp_ratio),
act_layer=act_layer,
drop=drop)
self.layer_scale = layer_scale is not None
if self.layer_scale:
self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
requires_grad=True)
self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
requires_grad=True)
self.res_post_norm = res_post_norm
if res_post_norm:
self.res_post_norm1 = build_norm_layer(channels, 'LN')
self.res_post_norm2 = build_norm_layer(channels, 'LN')
def forward(self, x):
def _inner_forward(x):
if not self.layer_scale:
if self.post_norm:
x = x + self.drop_path(self.norm1(self.dcn(x)))
x = x + self.drop_path(self.norm2(self.mlp(x)))
elif self.res_post_norm: # for InternImage-H/G
x = x + self.drop_path(self.res_post_norm1(self.dcn(self.norm1(x))))
x = x + self.drop_path(self.res_post_norm2(self.mlp(self.norm2(x))))
else:
x = x + self.drop_path(self.dcn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
if self.post_norm:
x = x + self.drop_path(self.gamma1 * self.norm1(self.dcn(x)))
x = x + self.drop_path(self.gamma2 * self.norm2(self.mlp(x)))
else:
x = x + self.drop_path(self.gamma1 * self.dcn(self.norm1(x)))
x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
return x
if self.with_cp and x.requires_grad:
x = checkpoint.checkpoint(_inner_forward, x)
else:
x = _inner_forward(x)
return x
class InternImageBlock(nn.Module):
r""" Block of InternImage
Args:
core_op (nn.Module): core operation of InternImage
channels (int): number of input channels
depths (list): Depth of each block.
groups (list): Groups of each block.
mlp_ratio (float): ratio of mlp hidden features to input channels
drop (float): dropout rate
drop_path (float): drop path rate
act_layer (str): activation layer
norm_layer (str): normalization layer
post_norm (bool): whether to use post normalization
layer_scale (float): layer scale
offset_scale (float): offset scale
with_cp (bool): whether to use checkpoint
"""
def __init__(self,
core_op,
channels,
depth,
groups,
downsample=True,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
act_layer='GELU',
norm_layer='LN',
post_norm=False,
offset_scale=1.0,
layer_scale=None,
with_cp=False,
dw_kernel_size=None, # for InternImage-H/G
post_norm_block_ids=None, # for InternImage-H/G
res_post_norm=False, # for InternImage-H/G
center_feature_scale=False): # for InternImage-H/G
super().__init__()
self.channels = channels
self.depth = depth
self.post_norm = post_norm
self.center_feature_scale = center_feature_scale
self.blocks = nn.ModuleList([
InternImageLayer(
core_op=core_op,
channels=channels,
groups=groups,
mlp_ratio=mlp_ratio,
drop=drop,
drop_path=drop_path[i] if isinstance(
drop_path, list) else drop_path,
act_layer=act_layer,
norm_layer=norm_layer,
post_norm=post_norm,
layer_scale=layer_scale,
offset_scale=offset_scale,
with_cp=with_cp,
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
res_post_norm=res_post_norm, # for InternImage-H/G
center_feature_scale=center_feature_scale # for InternImage-H/G
) for i in range(depth)
])
if not self.post_norm or center_feature_scale:
self.norm = build_norm_layer(channels, 'LN')
self.post_norm_block_ids = post_norm_block_ids
if post_norm_block_ids is not None: # for InternImage-H/G
self.post_norms = nn.ModuleList(
[build_norm_layer(channels, 'LN', eps=1e-6) for _ in post_norm_block_ids]
)
self.downsample = DownsampleLayer(
channels=channels, norm_layer=norm_layer) if downsample else None
def forward(self, x, return_wo_downsample=False):
for i, blk in enumerate(self.blocks):
x = blk(x)
if (self.post_norm_block_ids is not None) and (i in self.post_norm_block_ids):
index = self.post_norm_block_ids.index(i)
x = self.post_norms[index](x) # for InternImage-H/G
if not self.post_norm or self.center_feature_scale:
x = self.norm(x)
if return_wo_downsample:
x_ = x
if self.downsample is not None:
x = self.downsample(x)
if return_wo_downsample:
return x, x_
return x
@BACKBONES.register_module()
class InternImage(nn.Module):
r""" InternImage
A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions` -
https://arxiv.org/abs/2211.05778
Args:
core_op (str): Core operator. Default: 'DCNv3'
channels (int): Number of channels in the first stage. Default: 64
depths (list): Depth of each block. Default: [3, 4, 18, 5]
groups (list): Groups of each block. Default: [3, 6, 12, 24]
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
drop_rate (float): Probability of an element to be zeroed. Default: 0.
drop_path_rate (float): Stochastic depth rate. Default: 0.
act_layer (str): Activation layer. Default: 'GELU'
norm_layer (str): Normalization layer. Default: 'LN'
layer_scale (bool): Whether to use layer scale. Default: False
cls_scale (bool): Whether to use class scale. Default: False
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. Default: False
dw_kernel_size (int): Size of the dwconv. Default: None
level2_post_norm (bool): Whether to use level2 post norm. Default: False
level2_post_norm_block_ids (list): Indexes of post norm blocks. Default: None
res_post_norm (bool): Whether to use res post norm. Default: False
center_feature_scale (bool): Whether to use center feature scale. Default: False
"""
def __init__(self,
core_op='DCNv3',
channels=64,
depths=[3, 4, 18, 5],
groups=[3, 6, 12, 24],
mlp_ratio=4.,
drop_rate=0.,
drop_path_rate=0.2,
drop_path_type='linear',
act_layer='GELU',
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
with_cp=False,
dw_kernel_size=None, # for InternImage-H/G
level2_post_norm=False, # for InternImage-H/G
level2_post_norm_block_ids=None, # for InternImage-H/G
res_post_norm=False, # for InternImage-H/G
center_feature_scale=False, # for InternImage-H/G
out_indices=(0, 1, 2, 3),
init_cfg=None,
**kwargs):
super().__init__()
self.core_op = core_op
self.num_levels = len(depths)
self.depths = depths
self.channels = channels
self.num_features = int(channels * 2**(self.num_levels - 1))
self.post_norm = post_norm
self.mlp_ratio = mlp_ratio
self.init_cfg = init_cfg
self.out_indices = out_indices
self.level2_post_norm_block_ids = level2_post_norm_block_ids
logger = get_root_logger()
logger.info(f'using core type: {core_op}')
logger.info(f'using activation layer: {act_layer}')
logger.info(f'using main norm layer: {norm_layer}')
logger.info(f'using dpr: {drop_path_type}, {drop_path_rate}')
logger.info(f"level2_post_norm: {level2_post_norm}")
logger.info(f"level2_post_norm_block_ids: {level2_post_norm_block_ids}")
logger.info(f"res_post_norm: {res_post_norm}")
in_chans = 3
self.patch_embed = StemLayer(in_chans=in_chans,
out_chans=channels,
act_layer=act_layer,
norm_layer=norm_layer)
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
]
if drop_path_type == 'uniform':
for i in range(len(dpr)):
dpr[i] = drop_path_rate
self.levels = nn.ModuleList()
for i in range(self.num_levels):
post_norm_block_ids = level2_post_norm_block_ids if level2_post_norm and (
i == 2) else None # for InternImage-H/G
level = InternImageBlock(
core_op=getattr(opsm, core_op),
channels=int(channels * 2**i),
depth=depths[i],
groups=groups[i],
mlp_ratio=self.mlp_ratio,
drop=drop_rate,
drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
act_layer=act_layer,
norm_layer=norm_layer,
post_norm=post_norm,
downsample=(i < self.num_levels - 1),
layer_scale=layer_scale,
offset_scale=offset_scale,
with_cp=with_cp,
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
post_norm_block_ids=post_norm_block_ids, # for InternImage-H/G
res_post_norm=res_post_norm, # for InternImage-H/G
center_feature_scale=center_feature_scale # for InternImage-H/G
)
self.levels.append(level)
self.num_layers = len(depths)
self.apply(self._init_weights)
self.apply(self._init_deform_weights)
def init_weights(self):
logger = get_root_logger()
if self.init_cfg is None:
logger.warning(f'No pre-trained weights for '
f'{self.__class__.__name__}, '
f'training starts from scratch')
for m in self.modules():
if isinstance(m, nn.Linear):
trunc_normal_init(m, std=.02, bias=0.)
elif isinstance(m, nn.LayerNorm):
constant_init(m, 1.0)
else:
assert 'checkpoint' in self.init_cfg, f'Only support ' \
f'specify `Pretrained` in ' \
f'`init_cfg` in ' \
f'{self.__class__.__name__} '
ckpt = _load_checkpoint(self.init_cfg.checkpoint,
logger=logger,
map_location='cpu')
if 'state_dict' in ckpt:
_state_dict = ckpt['state_dict']
elif 'model' in ckpt:
_state_dict = ckpt['model']
else:
_state_dict = ckpt
state_dict = OrderedDict()
for k, v in _state_dict.items():
if k.startswith('backbone.'):
state_dict[k[9:]] = v
else:
state_dict[k] = v
# strip prefix of state_dict
if list(state_dict.keys())[0].startswith('module.'):
state_dict = {k[7:]: v for k, v in state_dict.items()}
# load state_dict
msg = self.load_state_dict(state_dict, False)
logger.info(msg)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def _init_deform_weights(self, m):
if isinstance(m, getattr(opsm, self.core_op)):
m._reset_parameters()
def forward(self, x):
x = self.patch_embed(x)
x = self.pos_drop(x)
seq_out = []
for level_idx, level in enumerate(self.levels):
x, x_ = level(x, return_wo_downsample=True)
if level_idx in self.out_indices:
seq_out.append(x_.permute(0, 3, 1, 2).contiguous())
return seq_out
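# Hedged usage sketch (the numbers below are illustrative, not the checked-in
# occupancy config): the backbone is typically selected from an mmdet config
# dict, e.g.
#
#   img_backbone = dict(
#       type='InternImage',
#       core_op='DCNv3',
#       channels=64,
#       depths=[4, 4, 18, 4],
#       groups=[4, 8, 16, 32],
#       mlp_ratio=4.,
#       drop_path_rate=0.2,
#       out_indices=(1, 2, 3),
#       init_cfg=dict(type='Pretrained', checkpoint='path/to/pretrained.pth'))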
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd
import DCNv3
class DCNv3Function(Function):
@staticmethod
@custom_fwd
def forward(
ctx, input, offset, mask,
kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w,
group, group_channels, offset_scale, im2col_step):
ctx.kernel_h = kernel_h
ctx.kernel_w = kernel_w
ctx.stride_h = stride_h
ctx.stride_w = stride_w
ctx.pad_h = pad_h
ctx.pad_w = pad_w
ctx.dilation_h = dilation_h
ctx.dilation_w = dilation_w
ctx.group = group
ctx.group_channels = group_channels
ctx.offset_scale = offset_scale
ctx.im2col_step = im2col_step
output = DCNv3.dcnv3_forward(
input, offset, mask, kernel_h,
kernel_w, stride_h, stride_w, pad_h,
pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, ctx.im2col_step)
ctx.save_for_backward(input, offset, mask)
return output
@staticmethod
@once_differentiable
@custom_bwd
def backward(ctx, grad_output):
input, offset, mask = ctx.saved_tensors
grad_input, grad_offset, grad_mask = \
DCNv3.dcnv3_backward(
input, offset, mask, ctx.kernel_h,
ctx.kernel_w, ctx.stride_h, ctx.stride_w, ctx.pad_h,
ctx.pad_w, ctx.dilation_h, ctx.dilation_w, ctx.group,
ctx.group_channels, ctx.offset_scale, grad_output.contiguous(), ctx.im2col_step)
return grad_input, grad_offset, grad_mask, \
None, None, None, None, None, None, None, None, None, None, None, None
@staticmethod
def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale, im2col_step):
"""Symbolic function for mmdeploy::DCNv3.
Returns:
DCNv3 op for onnx.
"""
return g.op(
'mmdeploy::TRTDCNv3',
input,
offset,
mask,
kernel_h_i=int(kernel_h),
kernel_w_i=int(kernel_w),
stride_h_i=int(stride_h),
stride_w_i=int(stride_w),
pad_h_i=int(pad_h),
pad_w_i=int(pad_w),
dilation_h_i=int(dilation_h),
dilation_w_i=int(dilation_w),
group_i=int(group),
group_channels_i=int(group_channels),
offset_scale_f=float(offset_scale),
im2col_step_i=int(im2col_step),
)
def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1):
_, H_, W_, _ = spatial_shapes
H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
ref_y, ref_x = torch.meshgrid(
torch.linspace(
# pad_h + 0.5,
# H_ - pad_h - 0.5,
(dilation_h * (kernel_h - 1)) // 2 + 0.5,
(dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h,
H_out,
dtype=torch.float32,
device=device),
torch.linspace(
# pad_w + 0.5,
# W_ - pad_w - 0.5,
(dilation_w * (kernel_w - 1)) // 2 + 0.5,
(dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w,
W_out,
dtype=torch.float32,
device=device))
ref_y = ref_y.reshape(-1)[None] / H_
ref_x = ref_x.reshape(-1)[None] / W_
ref = torch.stack((ref_x, ref_y), -1).reshape(
1, H_out, W_out, 1, 2)
return ref
def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h, dilation_w, group, device):
_, H_, W_, _ = spatial_shapes
points_list = []
x, y = torch.meshgrid(
torch.linspace(
-((dilation_w * (kernel_w - 1)) // 2),
-((dilation_w * (kernel_w - 1)) // 2) +
(kernel_w - 1) * dilation_w, kernel_w,
dtype=torch.float32,
device=device),
torch.linspace(
-((dilation_h * (kernel_h - 1)) // 2),
-((dilation_h * (kernel_h - 1)) // 2) +
(kernel_h - 1) * dilation_h, kernel_h,
dtype=torch.float32,
device=device))
points_list.extend([x / W_, y / H_])
grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\
repeat(1, group, 1).permute(1, 0, 2)
grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2)
return grid
def dcnv3_core_pytorch(
input, offset, mask, kernel_h,
kernel_w, stride_h, stride_w, pad_h,
pad_w, dilation_h, dilation_w, group,
group_channels, offset_scale):
# for debug and test only,
# need to use cuda version instead
input = F.pad(
input,
[0, 0, pad_h, pad_h, pad_w, pad_w])
N_, H_in, W_in, _ = input.shape
_, H_out, W_out, _ = offset.shape
ref = _get_reference_points(
input.shape, input.device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w)
grid = _generate_dilation_grids(
input.shape, kernel_h, kernel_w, dilation_h, dilation_w, group, input.device)
spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
repeat(1, 1, 1, group*kernel_h*kernel_w).to(input.device)
sampling_locations = (ref + grid * offset_scale).repeat(N_, 1, 1, 1, 1).flatten(3, 4) + \
offset * offset_scale / spatial_norm
P_ = kernel_h * kernel_w
sampling_grids = 2 * sampling_locations - 1
# N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, W_in
input_ = input.view(N_, H_in*W_in, group*group_channels).transpose(1, 2).\
reshape(N_*group, group_channels, H_in, W_in)
# N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
flatten(0, 1)
# N_*group, group_channels, H_out*W_out, P_
sampling_input_ = F.grid_sample(
input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
# (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_ -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_)
mask = mask.view(N_, H_out*W_out, group, P_).transpose(1, 2).\
reshape(N_*group, 1, H_out*W_out, P_)
output = (sampling_input_ * mask).sum(-1).view(N_,
group*group_channels, H_out*W_out)
return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
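# Hedged shape sketch (illustrative): for a channels-last input of shape
# (N, H, W, C) with a 3x3 kernel, stride 1, pad 1 and `group` groups
# (group_channels = C // group), the expected shapes are
#   offset: (N, H_out, W_out, group * 3 * 3 * 2)
#   mask:   (N, H_out, W_out, group * 3 * 3)
# and the function returns a tensor of shape (N, H_out, W_out, C).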
#!/usr/bin/env bash
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
python setup.py build install
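# Note (hedged): building the DCNv3 extension requires a CUDA-enabled PyTorch
# installation and a matching nvcc on PATH; the setup.py that follows raises an
# error when CUDA is unavailable.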
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3 import DCNv3, DCNv3_pytorch
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import warnings
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from ..functions import DCNv3Function, dcnv3_core_pytorch
class to_channels_first(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 3, 1, 2)
class to_channels_last(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 2, 3, 1)
def build_norm_layer(dim,
norm_layer,
in_format='channels_last',
out_format='channels_last',
eps=1e-6):
layers = []
if norm_layer == 'BN':
if in_format == 'channels_last':
layers.append(to_channels_first())
layers.append(nn.BatchNorm2d(dim))
if out_format == 'channels_last':
layers.append(to_channels_last())
elif norm_layer == 'LN':
if in_format == 'channels_first':
layers.append(to_channels_last())
layers.append(nn.LayerNorm(dim, eps=eps))
if out_format == 'channels_first':
layers.append(to_channels_first())
else:
raise NotImplementedError(
f'build_norm_layer does not support {norm_layer}')
return nn.Sequential(*layers)
def build_act_layer(act_layer):
if act_layer == 'ReLU':
return nn.ReLU(inplace=True)
elif act_layer == 'SiLU':
return nn.SiLU(inplace=True)
elif act_layer == 'GELU':
return nn.GELU()
raise NotImplementedError(f'build_act_layer does not support {act_layer}')
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
"invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
return (n & (n - 1) == 0) and n != 0
class CenterFeatureScaleModule(nn.Module):
def forward(self,
query,
center_feature_scale_proj_weight,
center_feature_scale_proj_bias):
center_feature_scale = F.linear(query,
weight=center_feature_scale_proj_weight,
bias=center_feature_scale_proj_bias).sigmoid()
return center_feature_scale
class DCNv3_pytorch(nn.Module):
def __init__(
self,
channels=64,
kernel_size=3,
dw_kernel_size=None,
stride=1,
pad=1,
dilation=1,
group=4,
offset_scale=1.0,
act_layer='GELU',
norm_layer='LN',
center_feature_scale=False):
"""
DCNv3 Module
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super().__init__()
if channels % group != 0:
raise ValueError(
f'channels must be divisible by group, but got {channels} and {group}')
_d_per_group = channels // group
dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size
# you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
self.offset_scale = offset_scale
self.channels = channels
self.kernel_size = kernel_size
self.dw_kernel_size = dw_kernel_size
self.stride = stride
self.dilation = dilation
self.pad = pad
self.group = group
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.dw_conv = nn.Sequential(
nn.Conv2d(
channels,
channels,
kernel_size=dw_kernel_size,
stride=1,
padding=(dw_kernel_size - 1) // 2,
groups=channels),
build_norm_layer(
channels,
norm_layer,
'channels_first',
'channels_last'),
build_act_layer(act_layer))
self.offset = nn.Linear(
channels,
group * kernel_size * kernel_size * 2)
self.mask = nn.Linear(
channels,
group * kernel_size * kernel_size)
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
self.center_feature_scale_proj_bias = nn.Parameter(
torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group, ))
self.center_feature_scale_module = CenterFeatureScaleModule()
def _reset_parameters(self):
constant_(self.offset.weight.data, 0.)
constant_(self.offset.bias.data, 0.)
constant_(self.mask.weight.data, 0.)
constant_(self.mask.bias.data, 0.)
xavier_uniform_(self.input_proj.weight.data)
constant_(self.input_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, input):
"""
:param query (N, H, W, C)
:return output (N, H, W, C)
"""
N, H, W, _ = input.shape
x = self.input_proj(input)
x_proj = x
x1 = input.permute(0, 3, 1, 2)
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
mask = F.softmax(mask, -1).reshape(N, H, W, -1)
x = dcnv3_core_pytorch(
x, offset, mask,
self.kernel_size, self.kernel_size,
self.stride, self.stride,
self.pad, self.pad,
self.dilation, self.dilation,
self.group, self.group_channels,
self.offset_scale)
if self.center_feature_scale:
center_feature_scale = self.center_feature_scale_module(
x1, self.center_feature_scale_proj_weight, self.center_feature_scale_proj_bias)
# N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels
center_feature_scale = center_feature_scale[..., None].repeat(
1, 1, 1, 1, self.channels // self.group).flatten(-2)
x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
x = self.output_proj(x)
return x
class DCNv3(nn.Module):
def __init__(
self,
channels=64,
kernel_size=3,
dw_kernel_size=None,
stride=1,
pad=1,
dilation=1,
group=4,
offset_scale=1.0,
act_layer='GELU',
norm_layer='LN',
center_feature_scale=False):
"""
DCNv3 Module
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super().__init__()
if channels % group != 0:
raise ValueError(
f'channels must be divisible by group, but got {channels} and {group}')
_d_per_group = channels // group
dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size
# you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_group):
warnings.warn(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
self.offset_scale = offset_scale
self.channels = channels
self.kernel_size = kernel_size
self.dw_kernel_size = dw_kernel_size
self.stride = stride
self.dilation = dilation
self.pad = pad
self.group = group
self.group_channels = channels // group
self.offset_scale = offset_scale
self.center_feature_scale = center_feature_scale
self.dw_conv = nn.Sequential(
nn.Conv2d(
channels,
channels,
kernel_size=dw_kernel_size,
stride=1,
padding=(dw_kernel_size - 1) // 2,
groups=channels),
build_norm_layer(
channels,
norm_layer,
'channels_first',
'channels_last'),
build_act_layer(act_layer))
self.offset = nn.Linear(
channels,
group * kernel_size * kernel_size * 2)
self.mask = nn.Linear(
channels,
group * kernel_size * kernel_size)
self.input_proj = nn.Linear(channels, channels)
self.output_proj = nn.Linear(channels, channels)
self._reset_parameters()
if center_feature_scale:
self.center_feature_scale_proj_weight = nn.Parameter(
torch.zeros((group, channels), dtype=torch.float))
self.center_feature_scale_proj_bias = nn.Parameter(
torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group, ))
self.center_feature_scale_module = CenterFeatureScaleModule()
def _reset_parameters(self):
constant_(self.offset.weight.data, 0.)
constant_(self.offset.bias.data, 0.)
constant_(self.mask.weight.data, 0.)
constant_(self.mask.bias.data, 0.)
xavier_uniform_(self.input_proj.weight.data)
constant_(self.input_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, input):
"""
:param query (N, H, W, C)
:return output (N, H, W, C)
"""
N, H, W, _ = input.shape
x = self.input_proj(input)
x_proj = x
dtype = x.dtype
x1 = input.permute(0, 3, 1, 2)
x1 = self.dw_conv(x1)
offset = self.offset(x1)
mask = self.mask(x1).reshape(N, H, W, self.group, -1)
mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype)
x = DCNv3Function.apply(
x, offset, mask,
self.kernel_size, self.kernel_size,
self.stride, self.stride,
self.pad, self.pad,
self.dilation, self.dilation,
self.group, self.group_channels,
self.offset_scale,
256)
if self.center_feature_scale:
center_feature_scale = self.center_feature_scale_module(
x1, self.center_feature_scale_proj_weight, self.center_feature_scale_proj_bias)
# N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels
center_feature_scale = center_feature_scale[..., None].repeat(
1, 1, 1, 1, self.channels // self.group).flatten(-2)
x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
x = self.output_proj(x)
return x
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import glob
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
# "-DCUDA_HAS_FP16=1",
# "-D__CUDA_NO_HALF_OPERATORS__",
# "-D__CUDA_NO_HALF_CONVERSIONS__",
# "-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
raise NotImplementedError('CUDA is not available')
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
"DCNv3",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name="DCNv3",
version="1.0",
author="InternImage",
url="https://github.com/OpenGVLab/InternImage",
description=
"PyTorch Wrapper for CUDA Functions of DCNv3",
packages=find_packages(exclude=(
"configs",
"tests",
)),
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const int im2col_step) {
AT_ERROR("Not implement on cpu");
}
std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step) {
AT_ERROR("Not implemented on the CPU");
}
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const int im2col_step);
std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step);
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "cuda/dcnv3_im2col_cuda.cuh"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>
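// Forward entry point: validates that the channels-last inputs are contiguous
// CUDA tensors, allocates the output, and processes the batch in chunks of
// im2col_step samples, calling dcnv3_im2col_cuda once per chunk.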
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels,
const float offset_scale, const int im2col_step) {
AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
const int batch = input.size(0);
const int height_in = input.size(1);
const int width_in = input.size(2);
const int channels = input.size(3);
const int height_out =
(height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1;
const int width_out =
(width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1;
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0,
"batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
AT_ASSERTM(
channels == (group * group_channels),
"Input channels and group times group channels wont match: (%d vs %d).",
channels, group * group_channels);
auto output =
at::zeros({batch, height_out, width_out, group * group_channels},
input.options());
const int batch_n = im2col_step_;
auto output_n = output.view({batch / batch_n, batch_n, height_out,
width_out, group * group_channels});
auto per_input_size = height_in * width_in * group * group_channels;
auto per_offset_size =
height_out * width_out * group * kernel_h * kernel_w * 2;
auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
for (int n = 0; n < batch / im2col_step_; ++n) {
auto columns = output_n.select(0, n);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.type(), "ms_deform_attn_forward_cuda", ([&] {
dcnv3_im2col_cuda(
at::cuda::getCurrentCUDAStream(),
input.data<scalar_t>() + n * im2col_step_ * per_input_size,
offset.data<scalar_t>() +
n * im2col_step_ * per_offset_size,
mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, batch_n, height_in, width_in, height_out,
width_out, offset_scale);
}));
}
return output;
}
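// Backward entry point: gradient buffers are allocated in float (the opmath
// type) even for half inputs, and cast back to half before being returned.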
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step) {
AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(),
"grad_output tensor has to be contiguous");
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(),
"grad_output must be a CUDA tensor");
const int batch = input.size(0);
const int height_in = input.size(1);
const int width_in = input.size(2);
const int channels = input.size(3);
const int height_out =
(height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1;
const int width_out =
(width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1;
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0,
"batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
AT_ASSERTM(
channels == (group * group_channels),
"Input channels and group times group channels wont match: (%d vs %d).",
channels, group * group_channels);
auto dtype = input.dtype();
if (dtype == at::kHalf) {
dtype = at::kFloat;
}
auto grad_input = at::zeros_like(input, dtype);
auto grad_offset = at::zeros_like(offset, dtype);
auto grad_mask = at::zeros_like(mask, dtype);
const int batch_n = im2col_step_;
auto per_input_size = height_in * width_in * group * group_channels;
auto per_offset_size =
height_out * width_out * group * kernel_h * kernel_w * 2;
auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
auto grad_output_n =
grad_output.view({batch / im2col_step_, batch_n, height_out * width_out,
group, group_channels});
for (int n = 0; n < batch / im2col_step_; ++n) {
auto grad_output_g = grad_output_n.select(0, n);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.type(), "ms_deform_attn_backward_cuda", ([&] {
dcnv3_col2im_cuda(
at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
input.data<scalar_t>() + n * im2col_step_ * per_input_size,
offset.data<scalar_t>() +
n * im2col_step_ * per_offset_size,
mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, group_channels, batch_n,
height_in, width_in, height_out, width_out, offset_scale,
grad_input.data<opmath_t>() +
n * im2col_step_ * per_input_size,
grad_offset.data<opmath_t>() +
n * im2col_step_ * per_offset_size,
grad_mask.data<opmath_t>() +
n * im2col_step_ * per_mask_size);
}));
}
if (input.dtype() == torch::kHalf) {
return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
grad_mask.to(torch::kHalf)};
} else {
return {grad_input, grad_offset, grad_mask};
}
}
\ No newline at end of file
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h,
const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels,
const float offset_scale, const int im2col_step);
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group,
const int group_channels, const float offset_scale,
const at::Tensor &grad_output, const int im2col_step);
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <ATen/ATen.h>
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
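// Grid-stride loop: each thread starts at its global index and advances by the
// total number of launched threads until all n elements have been covered.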
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 256;
inline int GET_BLOCKS(const int N, const int num_threads) {
return (N + num_threads - 1) / num_threads;
}
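// opmath_t is the accumulation type for scalar_t; for half inputs it resolves
// to float, so interpolation and gradient sums are carried out in full precision.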
#define opmath_t at::opmath_type<scalar_t>
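// Bilinear sampling of one channel at fractional location (h, w) from a
// channels-last (height, width, group * group_channels) plane; samples that
// fall outside the feature map contribute zero.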
template <typename scalar_t>
__device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
const int &height, const int &width,
const int &group,
const int &group_channels,
const opmath_t &h, const opmath_t &w,
const int &g, const int &c) {
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const opmath_t lh = h - h_low;
const opmath_t lw = w - w_low;
const opmath_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = group * group_channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = g * group_channels + c;
opmath_t v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
opmath_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
opmath_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
opmath_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
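// Backward of the bilinear sample: scatters the incoming gradient into the
// four neighbouring input pixels via atomicAdd and writes this sampling
// point's offset gradient (w, then h) and mask gradient to the given pointers.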
template <typename scalar_t>
__device__ void dcnv3_col2im_bilinear(
const scalar_t *&bottom_data, const int &height, const int &width,
const int &nheads, const int &group_channels, const opmath_t &h,
const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
opmath_t *grad_offset, opmath_t *grad_mask) {
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const opmath_t lh = h - h_low;
const opmath_t lw = w - w_low;
const opmath_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * group_channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * group_channels + c;
const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const opmath_t top_grad_im = top_grad * mask;
opmath_t grad_h_weight = 0, grad_w_weight = 0;
opmath_t v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_im + ptr1, w1 * top_grad_im);
}
opmath_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_im + ptr2, w2 * top_grad_im);
}
opmath_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_im + ptr3, w3 * top_grad_im);
}
opmath_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_im + ptr4, w4 * top_grad_im);
}
const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
*grad_mask = top_grad * val;
*grad_offset = offset_scale * grad_w_weight * top_grad_im;
*(grad_offset + 1) = offset_scale * grad_h_weight * top_grad_im;
}
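// Same as dcnv3_col2im_bilinear, except the offset and mask gradients are also
// accumulated with atomicAdd, for paths where several threads may write to the
// same location.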
template <typename scalar_t>
__device__ void dcnv3_col2im_bilinear_gm(
const scalar_t *&bottom_data, const int &height, const int &width,
const int &nheads, const int &group_channels, const opmath_t &h,
const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
opmath_t *grad_offset, opmath_t *grad_mask) {
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const opmath_t lh = h - h_low;
const opmath_t lw = w - w_low;
const opmath_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * group_channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * group_channels + c;
const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const opmath_t top_grad_im = top_grad * mask;
opmath_t grad_h_weight = 0, grad_w_weight = 0;
opmath_t v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_im + ptr1, w1 * top_grad_im);
}
opmath_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_im + ptr2, w2 * top_grad_im);
}
opmath_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_im + ptr3, w3 * top_grad_im);
}
opmath_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_im + ptr4, w4 * top_grad_im);
}
const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
atomicAdd(grad_mask, top_grad * val);
atomicAdd(grad_offset, offset_scale * grad_w_weight * top_grad_im);
atomicAdd(grad_offset + 1, offset_scale * grad_h_weight * top_grad_im);
}
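// Forward kernel: one thread per output element (batch, h_out, w_out, group,
// group_channel). The flattened index is decoded, then the mask-weighted
// bilinear samples over the kernel_h x kernel_w window are accumulated into
// data_col.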
template <typename scalar_t>
__global__ void dcnv3_im2col_gpu_kernel(
const int num_kernels, const scalar_t *data_im, const scalar_t *data_offset,
const scalar_t *data_mask, scalar_t *data_col, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int height_in,
const int width_in, const int height_out, const int width_out,
const opmath_t offset_scale) {
CUDA_KERNEL_LOOP(index, num_kernels) {
int _temp = index;
const int c_col = _temp % group_channels;
_temp /= group_channels;
const int sampling_index = _temp;
const int g_col = _temp % group;
_temp /= group;
const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
(_temp % width_out) * stride_w;
_temp /= width_out;
const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
(_temp % height_out) * stride_h;
_temp /= height_out;
const int b_col = _temp;
const int input_size = height_in * width_in;
scalar_t *data_col_ptr = data_col + index;
const int kernel_size = kernel_h * kernel_w;
int data_weight_ptr = sampling_index * kernel_size;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = group * group_channels;
opmath_t col = 0;
const scalar_t *data_im_ptr = data_im + b_col * input_size * qid_stride;
// top-left
const opmath_t p0_w_ =
p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
const opmath_t p0_h_ =
p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
for (int i = 0; i < kernel_w; ++i) {
for (int j = 0; j < kernel_h; ++j) {
const opmath_t offset_w = data_offset[data_loc_w_ptr];
const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
const opmath_t loc_w =
p0_w_ + (i * dilation_w + offset_w) * offset_scale;
const opmath_t loc_h =
p0_h_ + (j * dilation_h + offset_h) * offset_scale;
const opmath_t weight = data_mask[data_weight_ptr];
if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
loc_w < width_in) {
col += dcnv3_im2col_bilinear(
data_im_ptr, height_in, width_in, group,
group_channels, loc_h, loc_w, g_col, c_col) *
weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
*data_col_ptr = col;
}
}
// debug
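// Backward kernel, shared-memory reduction v1: blockSize is a compile-time
// template parameter; thread 0 serially sums the per-thread offset/mask
// gradients cached in shared memory and writes them out.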
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int height_in,
const int width_in, const int height_out, const int width_out,
const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
opmath_t *grad_mask) {
CUDA_KERNEL_LOOP(index, num_kernels) {
__shared__ opmath_t cache_grad_offset[blockSize * 2];
__shared__ opmath_t cache_grad_mask[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % group_channels;
_temp /= group_channels;
const int sampling_index = _temp;
const int g_col = _temp % group;
_temp /= group;
const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
(_temp % width_out) * stride_w;
_temp /= width_out;
const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
(_temp % height_out) * stride_h;
_temp /= height_out;
const int b_col = _temp;
const opmath_t top_grad = grad_col[index];
const int input_size = height_in * width_in;
const int kernel_size = kernel_h * kernel_w;
int data_weight_ptr = sampling_index * kernel_size;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_offset += grad_sampling_ptr << 1;
grad_mask += grad_sampling_ptr;
const int qid_stride = group * group_channels;
const int im_ptr_offset = b_col * input_size * qid_stride;
const scalar_t *data_im_ptr = data_im + im_ptr_offset;
opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
const opmath_t p0_w_ =
p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
const opmath_t p0_h_ =
p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
for (int i = 0; i < kernel_w; ++i) {
for (int j = 0; j < kernel_h; ++j) {
const opmath_t offset_w = data_offset[data_loc_w_ptr];
const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
const opmath_t loc_w =
p0_w_ + (i * dilation_w + offset_w) * offset_scale;
const opmath_t loc_h =
p0_h_ + (j * dilation_h + offset_h) * offset_scale;
const opmath_t weight = data_mask[data_weight_ptr];
*(cache_grad_offset + (threadIdx.x << 1)) = 0;
*(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_mask + threadIdx.x) = 0;
if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
loc_w < width_in) {
dcnv3_col2im_bilinear(
data_im_ptr, height_in, width_in, group, group_channels,
loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
weight, grad_im_ptr,
cache_grad_offset + (threadIdx.x << 1),
cache_grad_mask + threadIdx.x);
}
__syncthreads();
if (tid == 0) {
opmath_t _grad_w = cache_grad_offset[0],
_grad_h = cache_grad_offset[1],
_grad_a = cache_grad_mask[0];
int sid = 2;
for (unsigned int tid = 1; tid < blockSize; ++tid) {
_grad_w += cache_grad_offset[sid];
_grad_h += cache_grad_offset[sid + 1];
_grad_a += cache_grad_mask[tid];
sid += 2;
}
*grad_offset = _grad_w;
*(grad_offset + 1) = _grad_h;
*grad_mask = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_mask += 1;
grad_offset += 2;
}
}
}
}
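// Backward kernel, shared-memory reduction v2: like v1, but the per-thread
// partial gradients are combined with a tree reduction (stride halving), which
// assumes blockSize is a power of two.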
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int height_in,
const int width_in, const int height_out, const int width_out,
const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
opmath_t *grad_mask) {
CUDA_KERNEL_LOOP(index, num_kernels) {
__shared__ opmath_t cache_grad_offset[blockSize * 2];
__shared__ opmath_t cache_grad_mask[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % group_channels;
_temp /= group_channels;
const int sampling_index = _temp;
const int g_col = _temp % group;
_temp /= group;
const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
(_temp % width_out) * stride_w;
_temp /= width_out;
const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
(_temp % height_out) * stride_h;
_temp /= height_out;
const int b_col = _temp;
const opmath_t top_grad = grad_col[index];
const int input_size = height_in * width_in;
const int kernel_size = kernel_h * kernel_w;
int data_weight_ptr = sampling_index * kernel_size;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_offset += grad_sampling_ptr << 1;
grad_mask += grad_sampling_ptr;
const int qid_stride = group * group_channels;
const int im_ptr_offset = b_col * input_size * qid_stride;
const scalar_t *data_im_ptr = data_im + im_ptr_offset;
opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
const opmath_t p0_w_ =
p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
const opmath_t p0_h_ =
p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
for (int i = 0; i < kernel_w; ++i) {
for (int j = 0; j < kernel_h; ++j) {
const opmath_t offset_w = data_offset[data_loc_w_ptr];
const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
const opmath_t loc_w =
p0_w_ + (i * dilation_w + offset_w) * offset_scale;
const opmath_t loc_h =
p0_h_ + (j * dilation_h + offset_h) * offset_scale;
const opmath_t weight = data_mask[data_weight_ptr];
*(cache_grad_offset + (threadIdx.x << 1)) = 0;
*(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_mask + threadIdx.x) = 0;
if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
loc_w < width_in) {
dcnv3_col2im_bilinear(
data_im_ptr, height_in, width_in, group, group_channels,
loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
weight, grad_im_ptr,
cache_grad_offset + (threadIdx.x << 1),
cache_grad_mask + threadIdx.x);
}
__syncthreads();
for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_mask[tid] += cache_grad_mask[tid + s];
cache_grad_offset[xid1] += cache_grad_offset[xid2];
cache_grad_offset[xid1 + 1] +=
cache_grad_offset[xid2 + 1];
}
__syncthreads();
}
if (tid == 0) {
*grad_offset = cache_grad_offset[0];
*(grad_offset + 1) = cache_grad_offset[1];
*grad_mask = cache_grad_mask[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_mask += 1;
grad_offset += 2;
}
}
}
}
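// Backward kernel with dynamically sized shared memory (extern __shared__):
// same serial reduction by thread 0, but blockDim.x need not be known at
// compile time.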
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v1(
const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int height_in,
const int width_in, const int height_out, const int width_out,
const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
opmath_t *grad_mask) {
CUDA_KERNEL_LOOP(index, num_kernels) {
extern __shared__ int _s[];
opmath_t *cache_grad_offset = (opmath_t *)_s;
opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % group_channels;
_temp /= group_channels;
const int sampling_index = _temp;
const int g_col = _temp % group;
_temp /= group;
const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
(_temp % width_out) * stride_w;
_temp /= width_out;
const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
(_temp % height_out) * stride_h;
_temp /= height_out;
const int b_col = _temp;
const opmath_t top_grad = grad_col[index];
const int input_size = height_in * width_in;
const int kernel_size = kernel_h * kernel_w;
int data_weight_ptr = sampling_index * kernel_size;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_offset += grad_sampling_ptr << 1;
grad_mask += grad_sampling_ptr;
const int qid_stride = group * group_channels;
const int im_ptr_offset = b_col * input_size * qid_stride;
const scalar_t *data_im_ptr = data_im + im_ptr_offset;
opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
const opmath_t p0_w_ =
p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
const opmath_t p0_h_ =
p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
for (int i = 0; i < kernel_w; ++i) {
for (int j = 0; j < kernel_h; ++j) {
const opmath_t offset_w = data_offset[data_loc_w_ptr];
const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
const opmath_t loc_w =
p0_w_ + (i * dilation_w + offset_w) * offset_scale;
const opmath_t loc_h =
p0_h_ + (j * dilation_h + offset_h) * offset_scale;
const opmath_t weight = data_mask[data_weight_ptr];
*(cache_grad_offset + (threadIdx.x << 1)) = 0;
*(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_mask + threadIdx.x) = 0;
if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
loc_w < width_in) {
dcnv3_col2im_bilinear(
data_im_ptr, height_in, width_in, group, group_channels,
loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
weight, grad_im_ptr,
cache_grad_offset + (threadIdx.x << 1),
cache_grad_mask + threadIdx.x);
}
__syncthreads();
if (tid == 0) {
opmath_t _grad_w = cache_grad_offset[0],
_grad_h = cache_grad_offset[1],
_grad_a = cache_grad_mask[0];
int sid = 2;
for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
_grad_w += cache_grad_offset[sid];
_grad_h += cache_grad_offset[sid + 1];
_grad_a += cache_grad_mask[tid];
sid += 2;
}
*grad_offset = _grad_w;
*(grad_offset + 1) = _grad_h;
*grad_mask = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_mask += 1;
grad_offset += 2;
}
}
}
}
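// Dynamic shared-memory variant of the tree reduction; the extra `spre`
// bookkeeping folds in the leftover element when blockDim.x is not a power of
// two.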
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2(
const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int height_in,
const int width_in, const int height_out, const int width_out,
const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
opmath_t *grad_mask) {
CUDA_KERNEL_LOOP(index, num_kernels) {
extern __shared__ int _s[];
opmath_t *cache_grad_offset = (opmath_t *)_s;
opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % group_channels;
_temp /= group_channels;
const int sampling_index = _temp;
const int g_col = _temp % group;
_temp /= group;
const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
(_temp % width_out) * stride_w;
_temp /= width_out;
const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
(_temp % height_out) * stride_h;
_temp /= height_out;
const int b_col = _temp;
const opmath_t top_grad = grad_col[index];
const int input_size = height_in * width_in;
const int kernel_size = kernel_h * kernel_w;
int data_weight_ptr = sampling_index * kernel_size;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_offset += grad_sampling_ptr << 1;
grad_mask += grad_sampling_ptr;
const int qid_stride = group * group_channels;
const int im_ptr_offset = b_col * input_size * qid_stride;
const scalar_t *data_im_ptr = data_im + im_ptr_offset;
opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
const opmath_t p0_w_ =
p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
const opmath_t p0_h_ =
p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
for (int i = 0; i < kernel_w; ++i) {
for (int j = 0; j < kernel_h; ++j) {
const opmath_t offset_w = data_offset[data_loc_w_ptr];
const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
const opmath_t loc_w =
p0_w_ + (i * dilation_w + offset_w) * offset_scale;
const opmath_t loc_h =
p0_h_ + (j * dilation_h + offset_h) * offset_scale;
const opmath_t weight = data_mask[data_weight_ptr];
*(cache_grad_offset + (threadIdx.x << 1)) = 0;
*(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_mask + threadIdx.x) = 0;
if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
loc_w < width_in) {
dcnv3_col2im_bilinear(
data_im_ptr, height_in, width_in, group, group_channels,
loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
weight, grad_im_ptr,
cache_grad_offset + (threadIdx.x << 1),
cache_grad_mask + threadIdx.x);
}
__syncthreads();
for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
s >>= 1, spre >>= 1) {
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_mask[tid] += cache_grad_mask[tid + s];
cache_grad_offset[xid1] += cache_grad_offset[xid2];
cache_grad_offset[xid1 + 1] +=
cache_grad_offset[xid2 + 1];
if (tid + (s << 1) < spre) {
cache_grad_mask[tid] +=
cache_grad_mask[tid + (s << 1)];
cache_grad_offset[xid1] +=
cache_grad_offset[xid2 + (s << 1)];
cache_grad_offset[xid1 + 1] +=
cache_grad_offset[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0) {
*grad_offset = cache_grad_offset[0];
*(grad_offset + 1) = cache_grad_offset[1];
*grad_mask = cache_grad_mask[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_mask += 1;
grad_offset += 2;
}
}
}
}
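// Same reduction as above, but used when several thread blocks may cover one
// sampling location, so the final offset/mask gradients are added with
// atomicAdd instead of plain stores.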
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int height_in,
const int width_in, const int height_out, const int width_out,
const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
opmath_t *grad_mask) {
CUDA_KERNEL_LOOP(index, num_kernels) {
extern __shared__ int _s[];
opmath_t *cache_grad_offset = (opmath_t *)_s;
opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % group_channels;
_temp /= group_channels;
const int sampling_index = _temp;
const int g_col = _temp % group;
_temp /= group;
const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
(_temp % width_out) * stride_w;
_temp /= width_out;
const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
(_temp % height_out) * stride_h;
_temp /= height_out;
const int b_col = _temp;
const opmath_t top_grad = grad_col[index];
const int input_size = height_in * width_in;
const int kernel_size = kernel_h * kernel_w;
int data_weight_ptr = sampling_index * kernel_size;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_offset += grad_sampling_ptr << 1;
grad_mask += grad_sampling_ptr;
const int qid_stride = group * group_channels;
const int im_ptr_offset = b_col * input_size * qid_stride;
const scalar_t *data_im_ptr = data_im + im_ptr_offset;
opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
const opmath_t p0_w_ =
p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
const opmath_t p0_h_ =
p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
for (int i = 0; i < kernel_w; ++i) {
for (int j = 0; j < kernel_h; ++j) {
const opmath_t offset_w = data_offset[data_loc_w_ptr];
const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
const opmath_t loc_w =
p0_w_ + (i * dilation_w + offset_w) * offset_scale;
const opmath_t loc_h =
p0_h_ + (j * dilation_h + offset_h) * offset_scale;
const opmath_t weight = data_mask[data_weight_ptr];
*(cache_grad_offset + (threadIdx.x << 1)) = 0;
*(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_mask + threadIdx.x) = 0;
if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
loc_w < width_in) {
dcnv3_col2im_bilinear(
data_im_ptr, height_in, width_in, group, group_channels,
loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
weight, grad_im_ptr,
cache_grad_offset + (threadIdx.x << 1),
cache_grad_mask + threadIdx.x);
}
__syncthreads();
for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
s >>= 1, spre >>= 1) {
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_mask[tid] += cache_grad_mask[tid + s];
cache_grad_offset[xid1] += cache_grad_offset[xid2];
cache_grad_offset[xid1 + 1] +=
cache_grad_offset[xid2 + 1];
if (tid + (s << 1) < spre) {
cache_grad_mask[tid] +=
cache_grad_mask[tid + (s << 1)];
cache_grad_offset[xid1] +=
cache_grad_offset[xid2 + (s << 1)];
cache_grad_offset[xid1 + 1] +=
cache_grad_offset[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0) {
atomicAdd(grad_offset, cache_grad_offset[0]);
atomicAdd(grad_offset + 1, cache_grad_offset[1]);
atomicAdd(grad_mask, cache_grad_mask[0]);
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_mask += 1;
grad_offset += 2;
}
}
}
}
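// Global-memory fallback: no shared-memory reduction; every thread accumulates
// its contributions directly into grad_im, grad_offset and grad_mask through
// the *_gm bilinear helper (atomicAdd only).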
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_gm(
const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int height_in,
const int width_in, const int height_out, const int width_out,
const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
opmath_t *grad_mask) {
CUDA_KERNEL_LOOP(index, num_kernels) {
int _temp = index;
const int c_col = _temp % group_channels;
_temp /= group_channels;
const int sampling_index = _temp;
const int g_col = _temp % group;
_temp /= group;
const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
(_temp % width_out) * stride_w;
_temp /= width_out;
const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
(_temp % height_out) * stride_h;
_temp /= height_out;
const int b_col = _temp;
const opmath_t top_grad = grad_col[index];
const int input_size = height_in * width_in;
const int kernel_size = kernel_h * kernel_w;
int data_weight_ptr = sampling_index * kernel_size;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_offset += grad_sampling_ptr << 1;
grad_mask += grad_sampling_ptr;
const int qid_stride = group * group_channels;
const int im_ptr_offset = b_col * input_size * qid_stride;
const scalar_t *data_im_ptr = data_im + im_ptr_offset;
opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
const opmath_t p0_w_ =
p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
const opmath_t p0_h_ =
p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
for (int i = 0; i < kernel_w; ++i) {
for (int j = 0; j < kernel_h; ++j) {
const opmath_t offset_w = data_offset[data_loc_w_ptr];
const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
const opmath_t loc_w =
p0_w_ + (i * dilation_w + offset_w) * offset_scale;
const opmath_t loc_h =
p0_h_ + (j * dilation_h + offset_h) * offset_scale;
const opmath_t weight = data_mask[data_weight_ptr];
if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
loc_w < width_in) {
dcnv3_col2im_bilinear_gm(
data_im_ptr, height_in, width_in, group, group_channels,
loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
weight, grad_im_ptr, grad_offset, grad_mask);
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_mask += 1;
grad_offset += 2;
}
}
}
}
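// Host-side launcher for the forward pass: one thread per output element,
// CUDA_NUM_THREADS (256) threads per block, followed by an error check.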
template <typename scalar_t>
void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask,
scalar_t *data_col, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w,
const int group, const int group_channels,
const int batch_n, const int height_in,
const int width_in, const int height_out,
const int width_out, const opmath_t offset_scale) {
const int num_kernels =
batch_n * height_out * width_out * group * group_channels;
const int num_actual_kernels =
batch_n * height_out * width_out * group * group_channels;
const int num_threads = CUDA_NUM_THREADS;
dcnv3_im2col_gpu_kernel<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, data_im, data_offset, data_mask, data_col,
kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, group_channels, height_in,
width_in, height_out, width_out, offset_scale);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in dcnv3_im2col_cuda: %s\n", cudaGetErrorString(err));
}
}
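// Host-side launcher for the backward pass: the block size is
// min(group_channels, CUDA_NUM_THREADS), and the kernel variant is chosen from
// group_channels (power-of-two sizes use the blocksize-aware kernels,
// multiples of 1024 above that use the multi-block kernel, and other sizes
// fall back to the dynamic shared-memory or global-memory kernels).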
template <typename scalar_t>
void dcnv3_col2im_cuda(
cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels, const int batch_n,
const int height_in, const int width_in, const int height_out,
const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
opmath_t *grad_offset, opmath_t *grad_mask) {
const int num_threads =
(group_channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : group_channels;
const int num_kernels =
batch_n * height_out * width_out * group * group_channels;
const int num_actual_kernels =
batch_n * height_out * width_out * group * group_channels;
if (group_channels > 1024) {
if ((group_channels & 1023) == 0) {
dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
num_threads * 3 * sizeof(opmath_t), stream>>>(
num_kernels, grad_col, data_im, data_offset, data_mask,
kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, group_channels, height_in,
width_in, height_out, width_out, offset_scale, grad_im,
grad_offset, grad_mask);
} else {
dcnv3_col2im_gpu_kernel_gm<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
}
} else {
switch (group_channels) {
case 1:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 2:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 4:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 8:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 16:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 32:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 64:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 128:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 256:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 512:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
case 1024:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
1024>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
stream>>>(num_kernels, grad_col, data_im, data_offset,
data_mask, kernel_h, kernel_w, stride_h, stride_w,
pad_h, pad_w, dilation_h, dilation_w, group,
group_channels, height_in, width_in, height_out,
width_out, offset_scale, grad_im, grad_offset,
grad_mask);
break;
default:
if (group_channels < 64) {
dcnv3_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
num_threads * 3 * sizeof(opmath_t), stream>>>(
num_kernels, grad_col, data_im, data_offset, data_mask,
kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, group_channels,
height_in, width_in, height_out, width_out,
offset_scale, grad_im, grad_offset, grad_mask);
} else {
dcnv3_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
num_threads * 3 * sizeof(opmath_t), stream>>>(
num_kernels, grad_col, data_im, data_offset, data_mask,
kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, group, group_channels,
height_in, width_in, height_out, width_out,
offset_scale, grad_im, grad_offset, grad_mask);
}
}
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
\ No newline at end of file
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/dcnv3_cpu.h"
#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif
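// Device dispatcher: routes to the CUDA kernels when the input tensor lives on
// the GPU; a CPU implementation is not provided.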
at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w,
const int group, const int group_channels,
const float offset_scale, const int im2col_step) {
if (input.type().is_cuda()) {
#ifdef WITH_CUDA
return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, group_channels,
offset_scale, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
const at::Tensor &mask, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h,
const int pad_w, const int dilation_h, const int dilation_w,
const int group, const int group_channels,
const float offset_scale, const at::Tensor &grad_output,
const int im2col_step) {
if (input.type().is_cuda()) {
#ifdef WITH_CUDA
return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, group_channels,
offset_scale, grad_output, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}