Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dcnv3
Commits
c4552f79
Commit
c4552f79
authored
Mar 04, 2023
by
zhe chen
Browse files
Release detection and segmentation
parent
5ba0b547
Changes
84
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2862 additions
and
0 deletions
+2862
-0
segmentation/mmcv_custom/__init__.py
segmentation/mmcv_custom/__init__.py
+10
-0
segmentation/mmcv_custom/custom_layer_decay_optimizer_constructor.py
...n/mmcv_custom/custom_layer_decay_optimizer_constructor.py
+154
-0
segmentation/mmseg_custom/__init__.py
segmentation/mmseg_custom/__init__.py
+7
-0
segmentation/mmseg_custom/models/__init__.py
segmentation/mmseg_custom/models/__init__.py
+7
-0
segmentation/mmseg_custom/models/backbones/__init__.py
segmentation/mmseg_custom/models/backbones/__init__.py
+9
-0
segmentation/mmseg_custom/models/backbones/intern_image.py
segmentation/mmseg_custom/models/backbones/intern_image.py
+482
-0
segmentation/ops_dcnv3/functions/__init__.py
segmentation/ops_dcnv3/functions/__init__.py
+7
-0
segmentation/ops_dcnv3/functions/dcnv3_func.py
segmentation/ops_dcnv3/functions/dcnv3_func.py
+161
-0
segmentation/ops_dcnv3/make.sh
segmentation/ops_dcnv3/make.sh
+8
-0
segmentation/ops_dcnv3/modules/__init__.py
segmentation/ops_dcnv3/modules/__init__.py
+7
-0
segmentation/ops_dcnv3/modules/dcnv3.py
segmentation/ops_dcnv3/modules/dcnv3.py
+278
-0
segmentation/ops_dcnv3/setup.py
segmentation/ops_dcnv3/setup.py
+75
-0
segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
+37
-0
segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.h
segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.h
+31
-0
segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+174
-0
segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.h
segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.h
+31
-0
segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+1045
-0
segmentation/ops_dcnv3/src/dcnv3.h
segmentation/ops_dcnv3/src/dcnv3.h
+59
-0
segmentation/ops_dcnv3/src/vision.cpp
segmentation/ops_dcnv3/src/vision.cpp
+17
-0
segmentation/ops_dcnv3/test.py
segmentation/ops_dcnv3/test.py
+263
-0
No files found.
segmentation/mmcv_custom/__init__.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# -*- coding: utf-8 -*-
from
.custom_layer_decay_optimizer_constructor
import
CustomLayerDecayOptimizerConstructor
__all__
=
[
'CustomLayerDecayOptimizerConstructor'
,]
segmentation/mmcv_custom/custom_layer_decay_optimizer_constructor.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
"""
Mostly copy-paste from BEiT library:
https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py
"""
import
json
from
mmcv.runner
import
OPTIMIZER_BUILDERS
,
DefaultOptimizerConstructor
from
mmcv.runner
import
get_dist_info
from
mmseg.utils
import
get_root_logger
def
get_num_layer_for_swin
(
var_name
,
num_max_layer
,
depths
):
if
var_name
.
startswith
(
"backbone.patch_embed"
):
return
0
elif
var_name
.
startswith
(
'decode_head.mask_embed'
):
return
0
elif
var_name
.
startswith
(
'decode_head.cls_embed'
):
return
0
elif
var_name
.
startswith
(
'decode_head.level_embed'
):
return
0
elif
var_name
.
startswith
(
'decode_head.query_embed'
):
return
0
elif
var_name
.
startswith
(
'decode_head.query_feat'
):
return
0
if
var_name
.
startswith
(
"backbone.cb_modules.0.patch_embed"
):
return
0
elif
"level_embeds"
in
var_name
:
return
0
elif
var_name
.
startswith
(
"backbone.layers"
)
or
var_name
.
startswith
(
"backbone.levels"
):
if
var_name
.
split
(
'.'
)[
3
]
not
in
[
'downsample'
,
'norm'
]:
stage_id
=
int
(
var_name
.
split
(
'.'
)[
2
])
layer_id
=
int
(
var_name
.
split
(
'.'
)[
4
])
# layers for Swin-Large: [2, 2, 18, 2]
if
stage_id
==
0
:
return
layer_id
+
1
elif
stage_id
==
1
:
return
layer_id
+
1
+
depths
[
0
]
elif
stage_id
==
2
:
return
layer_id
+
1
+
depths
[
0
]
+
depths
[
1
]
else
:
return
layer_id
+
1
+
depths
[
0
]
+
depths
[
1
]
+
depths
[
2
]
else
:
stage_id
=
int
(
var_name
.
split
(
'.'
)[
2
])
if
stage_id
==
0
:
return
1
+
depths
[
0
]
elif
stage_id
==
1
:
return
1
+
depths
[
0
]
+
depths
[
1
]
elif
stage_id
==
2
:
return
1
+
depths
[
0
]
+
depths
[
1
]
+
depths
[
2
]
else
:
return
1
+
depths
[
0
]
+
depths
[
1
]
+
depths
[
2
]
else
:
return
num_max_layer
-
1
@
OPTIMIZER_BUILDERS
.
register_module
()
class
CustomLayerDecayOptimizerConstructor
(
DefaultOptimizerConstructor
):
def
add_params
(
self
,
params
,
module
,
prefix
=
''
,
is_dcn_module
=
None
):
"""Add all parameters of module to the params list.
The parameters of the given module will be added to the list of param
groups, with specific rules defined by paramwise_cfg.
Args:
params (list[dict]): A list of param groups, it will be modified
in place.
module (nn.Module): The module to be added.
prefix (str): The prefix of the module
is_dcn_module (int|float|None): If the current module is a
submodule of DCN, `is_dcn_module` will be passed to
control conv_offset layer's learning rate. Defaults to None.
"""
parameter_groups
=
{}
logger
=
get_root_logger
()
logger
.
info
(
self
.
paramwise_cfg
)
backbone_small_lr
=
self
.
paramwise_cfg
.
get
(
'backbone_small_lr'
,
False
)
dino_head
=
self
.
paramwise_cfg
.
get
(
'dino_head'
,
False
)
num_layers
=
self
.
paramwise_cfg
.
get
(
'num_layers'
)
+
2
layer_decay_rate
=
self
.
paramwise_cfg
.
get
(
'layer_decay_rate'
)
depths
=
self
.
paramwise_cfg
.
get
(
'depths'
)
offset_lr_scale
=
self
.
paramwise_cfg
.
get
(
'offset_lr_scale'
,
1.0
)
logger
.
info
(
"Build CustomLayerDecayOptimizerConstructor %f - %d"
%
(
layer_decay_rate
,
num_layers
))
weight_decay
=
self
.
base_wd
for
name
,
param
in
module
.
named_parameters
():
if
not
param
.
requires_grad
:
continue
# frozen weights
if
len
(
param
.
shape
)
==
1
or
name
.
endswith
(
".bias"
)
or
\
"relative_position"
in
name
or
\
"norm"
in
name
or
\
"sampling_offsets"
in
name
:
group_name
=
"no_decay"
this_weight_decay
=
0.
else
:
group_name
=
"decay"
this_weight_decay
=
weight_decay
layer_id
=
get_num_layer_for_swin
(
name
,
num_layers
,
depths
)
if
layer_id
==
num_layers
-
1
and
dino_head
and
\
(
"sampling_offsets"
in
name
or
"reference_points"
in
name
):
group_name
=
"layer_%d_%s_0.1x"
%
(
layer_id
,
group_name
)
elif
(
"sampling_offsets"
in
name
or
"reference_points"
in
name
)
and
"backbone"
in
name
:
group_name
=
"layer_%d_%s_offset_lr_scale"
%
(
layer_id
,
group_name
)
else
:
group_name
=
"layer_%d_%s"
%
(
layer_id
,
group_name
)
if
group_name
not
in
parameter_groups
:
scale
=
layer_decay_rate
**
(
num_layers
-
layer_id
-
1
)
if
scale
<
1
and
backbone_small_lr
==
True
:
scale
=
scale
*
0.1
if
"0.1x"
in
group_name
:
scale
=
scale
*
0.1
if
"offset_lr_scale"
in
group_name
:
scale
=
scale
*
offset_lr_scale
parameter_groups
[
group_name
]
=
{
"weight_decay"
:
this_weight_decay
,
"params"
:
[],
"param_names"
:
[],
"lr_scale"
:
scale
,
"group_name"
:
group_name
,
"lr"
:
scale
*
self
.
base_lr
,
}
parameter_groups
[
group_name
][
"params"
].
append
(
param
)
parameter_groups
[
group_name
][
"param_names"
].
append
(
name
)
rank
,
_
=
get_dist_info
()
if
rank
==
0
:
to_display
=
{}
for
key
in
parameter_groups
:
to_display
[
key
]
=
{
"param_names"
:
parameter_groups
[
key
][
"param_names"
],
"lr_scale"
:
parameter_groups
[
key
][
"lr_scale"
],
"lr"
:
parameter_groups
[
key
][
"lr"
],
"weight_decay"
:
parameter_groups
[
key
][
"weight_decay"
],
}
logger
.
info
(
"Param groups = %s"
%
json
.
dumps
(
to_display
,
indent
=
2
))
# state_dict = module.state_dict()
# for group_name in parameter_groups:
# group = parameter_groups[group_name]
# for name in group["param_names"]:
# group["params"].append(state_dict[name])
params
.
extend
(
parameter_groups
.
values
())
\ No newline at end of file
segmentation/mmseg_custom/__init__.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
.models
import
*
# noqa: F401,F403
segmentation/mmseg_custom/models/__init__.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
.backbones
import
*
# noqa: F401,F403
\ No newline at end of file
segmentation/mmseg_custom/models/backbones/__init__.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
.intern_image
import
InternImage
__all__
=
[
'InternImage'
]
segmentation/mmseg_custom/models/backbones/intern_image.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
torch
import
torch.nn
as
nn
from
collections
import
OrderedDict
import
torch.utils.checkpoint
as
checkpoint
from
timm.models.layers
import
trunc_normal_
,
DropPath
from
mmcv.runner
import
_load_checkpoint
from
mmcv.cnn
import
constant_init
,
trunc_normal_init
from
mmseg.utils
import
get_root_logger
from
mmseg.models.builder
import
BACKBONES
from
ops_dcnv3
import
modules
as
opsm
class
to_channels_first
(
nn
.
Module
):
def
__init__
(
self
):
super
().
__init__
()
def
forward
(
self
,
x
):
return
x
.
permute
(
0
,
3
,
1
,
2
)
class
to_channels_last
(
nn
.
Module
):
def
__init__
(
self
):
super
().
__init__
()
def
forward
(
self
,
x
):
return
x
.
permute
(
0
,
2
,
3
,
1
)
def
build_norm_layer
(
dim
,
norm_layer
,
in_format
=
'channels_last'
,
out_format
=
'channels_last'
,
eps
=
1e-6
):
layers
=
[]
if
norm_layer
==
'BN'
:
if
in_format
==
'channels_last'
:
layers
.
append
(
to_channels_first
())
layers
.
append
(
nn
.
BatchNorm2d
(
dim
))
if
out_format
==
'channels_last'
:
layers
.
append
(
to_channels_last
())
elif
norm_layer
==
'LN'
:
if
in_format
==
'channels_first'
:
layers
.
append
(
to_channels_last
())
layers
.
append
(
nn
.
LayerNorm
(
dim
,
eps
=
eps
))
if
out_format
==
'channels_first'
:
layers
.
append
(
to_channels_first
())
else
:
raise
NotImplementedError
(
f
'build_norm_layer does not support
{
norm_layer
}
'
)
return
nn
.
Sequential
(
*
layers
)
def
build_act_layer
(
act_layer
):
if
act_layer
==
'ReLU'
:
return
nn
.
ReLU
(
inplace
=
True
)
elif
act_layer
==
'SiLU'
:
return
nn
.
SiLU
(
inplace
=
True
)
elif
act_layer
==
'GELU'
:
return
nn
.
GELU
()
raise
NotImplementedError
(
f
'build_act_layer does not support
{
act_layer
}
'
)
class
StemLayer
(
nn
.
Module
):
r
""" Stem layer of InternImage
Args:
in_chans (int): number of input channels
out_chans (int): number of output channels
act_layer (str): activation layer
norm_layer (str): normalization layer
"""
def
__init__
(
self
,
in_chans
=
3
,
out_chans
=
96
,
act_layer
=
'GELU'
,
norm_layer
=
'BN'
):
super
().
__init__
()
self
.
conv1
=
nn
.
Conv2d
(
in_chans
,
out_chans
//
2
,
kernel_size
=
3
,
stride
=
2
,
padding
=
1
)
self
.
norm1
=
build_norm_layer
(
out_chans
//
2
,
norm_layer
,
'channels_first'
,
'channels_first'
)
self
.
act
=
build_act_layer
(
act_layer
)
self
.
conv2
=
nn
.
Conv2d
(
out_chans
//
2
,
out_chans
,
kernel_size
=
3
,
stride
=
2
,
padding
=
1
)
self
.
norm2
=
build_norm_layer
(
out_chans
,
norm_layer
,
'channels_first'
,
'channels_last'
)
def
forward
(
self
,
x
):
x
=
self
.
conv1
(
x
)
x
=
self
.
norm1
(
x
)
x
=
self
.
act
(
x
)
x
=
self
.
conv2
(
x
)
x
=
self
.
norm2
(
x
)
return
x
class
DownsampleLayer
(
nn
.
Module
):
r
""" Downsample layer of InternImage
Args:
channels (int): number of input channels
norm_layer (str): normalization layer
"""
def
__init__
(
self
,
channels
,
norm_layer
=
'LN'
):
super
().
__init__
()
self
.
conv
=
nn
.
Conv2d
(
channels
,
2
*
channels
,
kernel_size
=
3
,
stride
=
2
,
padding
=
1
,
bias
=
False
)
self
.
norm
=
build_norm_layer
(
2
*
channels
,
norm_layer
,
'channels_first'
,
'channels_last'
)
def
forward
(
self
,
x
):
x
=
self
.
conv
(
x
.
permute
(
0
,
3
,
1
,
2
))
x
=
self
.
norm
(
x
)
return
x
class
MLPLayer
(
nn
.
Module
):
r
""" MLP layer of InternImage
Args:
in_features (int): number of input features
hidden_features (int): number of hidden features
out_features (int): number of output features
act_layer (str): activation layer
drop (float): dropout rate
"""
def
__init__
(
self
,
in_features
,
hidden_features
=
None
,
out_features
=
None
,
act_layer
=
'GELU'
,
drop
=
0.
):
super
().
__init__
()
out_features
=
out_features
or
in_features
hidden_features
=
hidden_features
or
in_features
self
.
fc1
=
nn
.
Linear
(
in_features
,
hidden_features
)
self
.
act
=
build_act_layer
(
act_layer
)
self
.
fc2
=
nn
.
Linear
(
hidden_features
,
out_features
)
self
.
drop
=
nn
.
Dropout
(
drop
)
def
forward
(
self
,
x
):
x
=
self
.
fc1
(
x
)
x
=
self
.
act
(
x
)
x
=
self
.
drop
(
x
)
x
=
self
.
fc2
(
x
)
x
=
self
.
drop
(
x
)
return
x
class
InternImageLayer
(
nn
.
Module
):
r
""" Basic layer of InternImage
Args:
core_op (nn.Module): core operation of InternImage
channels (int): number of input channels
groups (list): Groups of each block.
mlp_ratio (float): ratio of mlp hidden features to input channels
drop (float): dropout rate
drop_path (float): drop path rate
act_layer (str): activation layer
norm_layer (str): normalization layer
post_norm (bool): whether to use post normalization
layer_scale (float): layer scale
offset_scale (float): offset scale
with_cp (bool): whether to use checkpoint
"""
def
__init__
(
self
,
core_op
,
channels
,
groups
,
mlp_ratio
=
4.
,
drop
=
0.
,
drop_path
=
0.
,
act_layer
=
'GELU'
,
norm_layer
=
'LN'
,
post_norm
=
False
,
layer_scale
=
None
,
offset_scale
=
1.0
,
with_cp
=
False
):
super
().
__init__
()
self
.
channels
=
channels
self
.
groups
=
groups
self
.
mlp_ratio
=
mlp_ratio
self
.
with_cp
=
with_cp
self
.
norm1
=
build_norm_layer
(
channels
,
'LN'
)
self
.
post_norm
=
post_norm
self
.
dcn
=
core_op
(
channels
=
channels
,
kernel_size
=
3
,
stride
=
1
,
pad
=
1
,
dilation
=
1
,
group
=
groups
,
offset_scale
=
offset_scale
,
act_layer
=
act_layer
,
norm_layer
=
norm_layer
)
self
.
drop_path
=
DropPath
(
drop_path
)
if
drop_path
>
0.
\
else
nn
.
Identity
()
self
.
norm2
=
build_norm_layer
(
channels
,
'LN'
)
self
.
mlp
=
MLPLayer
(
in_features
=
channels
,
hidden_features
=
int
(
channels
*
mlp_ratio
),
act_layer
=
act_layer
,
drop
=
drop
)
self
.
layer_scale
=
layer_scale
is
not
None
if
self
.
layer_scale
:
self
.
gamma1
=
nn
.
Parameter
(
layer_scale
*
torch
.
ones
(
channels
),
requires_grad
=
True
)
self
.
gamma2
=
nn
.
Parameter
(
layer_scale
*
torch
.
ones
(
channels
),
requires_grad
=
True
)
def
forward
(
self
,
x
):
def
_inner_forward
(
x
):
if
not
self
.
layer_scale
:
if
self
.
post_norm
:
x
=
x
+
self
.
drop_path
(
self
.
norm1
(
self
.
dcn
(
x
)))
x
=
x
+
self
.
drop_path
(
self
.
norm2
(
self
.
mlp
(
x
)))
else
:
x
=
x
+
self
.
drop_path
(
self
.
dcn
(
self
.
norm1
(
x
)))
x
=
x
+
self
.
drop_path
(
self
.
mlp
(
self
.
norm2
(
x
)))
return
x
if
self
.
post_norm
:
x
=
x
+
self
.
drop_path
(
self
.
gamma1
*
self
.
norm1
(
self
.
dcn
(
x
)))
x
=
x
+
self
.
drop_path
(
self
.
gamma2
*
self
.
norm2
(
self
.
mlp
(
x
)))
else
:
x
=
x
+
self
.
drop_path
(
self
.
gamma1
*
self
.
dcn
(
self
.
norm1
(
x
)))
x
=
x
+
self
.
drop_path
(
self
.
gamma2
*
self
.
mlp
(
self
.
norm2
(
x
)))
return
x
if
self
.
with_cp
and
x
.
requires_grad
:
x
=
checkpoint
.
checkpoint
(
_inner_forward
,
x
)
else
:
x
=
_inner_forward
(
x
)
return
x
class
InternImageBlock
(
nn
.
Module
):
r
""" Block of InternImage
Args:
core_op (nn.Module): core operation of InternImage
channels (int): number of input channels
depths (list): Depth of each block.
groups (list): Groups of each block.
mlp_ratio (float): ratio of mlp hidden features to input channels
drop (float): dropout rate
drop_path (float): drop path rate
act_layer (str): activation layer
norm_layer (str): normalization layer
post_norm (bool): whether to use post normalization
layer_scale (float): layer scale
offset_scale (float): offset scale
with_cp (bool): whether to use checkpoint
"""
def
__init__
(
self
,
core_op
,
channels
,
depth
,
groups
,
downsample
=
True
,
mlp_ratio
=
4.
,
drop
=
0.
,
drop_path
=
0.
,
act_layer
=
'GELU'
,
norm_layer
=
'LN'
,
post_norm
=
False
,
offset_scale
=
1.0
,
layer_scale
=
None
,
with_cp
=
False
):
super
().
__init__
()
self
.
channels
=
channels
self
.
depth
=
depth
self
.
post_norm
=
post_norm
self
.
blocks
=
nn
.
ModuleList
([
InternImageLayer
(
core_op
=
core_op
,
channels
=
channels
,
groups
=
groups
,
mlp_ratio
=
mlp_ratio
,
drop
=
drop
,
drop_path
=
drop_path
[
i
]
if
isinstance
(
drop_path
,
list
)
else
drop_path
,
act_layer
=
act_layer
,
norm_layer
=
norm_layer
,
post_norm
=
post_norm
,
layer_scale
=
layer_scale
,
offset_scale
=
offset_scale
,
with_cp
=
with_cp
)
for
i
in
range
(
depth
)
])
if
not
self
.
post_norm
:
self
.
norm
=
build_norm_layer
(
channels
,
'LN'
)
self
.
downsample
=
DownsampleLayer
(
channels
=
channels
,
norm_layer
=
norm_layer
)
if
downsample
else
None
def
forward
(
self
,
x
,
return_wo_downsample
=
False
):
for
blk
in
self
.
blocks
:
x
=
blk
(
x
)
if
not
self
.
post_norm
:
x
=
self
.
norm
(
x
)
if
return_wo_downsample
:
x_
=
x
if
self
.
downsample
is
not
None
:
x
=
self
.
downsample
(
x
)
if
return_wo_downsample
:
return
x
,
x_
return
x
@
BACKBONES
.
register_module
()
class
InternImage
(
nn
.
Module
):
r
""" InternImage
A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions` -
https://arxiv.org/pdf/2103.14030
Args:
core_op (str): Core operator. Default: 'DCNv3'
channels (int): Number of the first stage. Default: 64
depths (list): Depth of each block. Default: [3, 4, 18, 5]
groups (list): Groups of each block. Default: [3, 6, 12, 24]
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
drop_rate (float): Probability of an element to be zeroed. Default: 0.
drop_path_rate (float): Stochastic depth rate. Default: 0.
act_layer (str): Activation layer. Default: 'GELU'
norm_layer (str): Normalization layer. Default: 'LN'
layer_scale (bool): Whether to use layer scale. Default: False
cls_scale (bool): Whether to use class scale. Default: False
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
"""
def
__init__
(
self
,
core_op
=
'DCNv3'
,
channels
=
64
,
depths
=
[
3
,
4
,
18
,
5
],
groups
=
[
3
,
6
,
12
,
24
],
mlp_ratio
=
4.
,
drop_rate
=
0.
,
drop_path_rate
=
0.2
,
drop_path_type
=
'linear'
,
act_layer
=
'GELU'
,
norm_layer
=
'LN'
,
layer_scale
=
None
,
offset_scale
=
1.0
,
post_norm
=
False
,
with_cp
=
False
,
out_indices
=
(
0
,
1
,
2
,
3
),
init_cfg
=
None
,
**
kwargs
):
super
().
__init__
()
self
.
core_op
=
core_op
self
.
num_levels
=
len
(
depths
)
self
.
depths
=
depths
self
.
channels
=
channels
self
.
num_features
=
int
(
channels
*
2
**
(
self
.
num_levels
-
1
))
self
.
post_norm
=
post_norm
self
.
mlp_ratio
=
mlp_ratio
self
.
init_cfg
=
init_cfg
self
.
out_indices
=
out_indices
print
(
f
'using core type:
{
core_op
}
'
)
print
(
f
'using activation layer:
{
act_layer
}
'
)
print
(
f
'using main norm layer:
{
norm_layer
}
'
)
print
(
f
'using dpr:
{
drop_path_type
}
,
{
drop_path_rate
}
'
)
in_chans
=
3
self
.
patch_embed
=
StemLayer
(
in_chans
=
in_chans
,
out_chans
=
channels
,
act_layer
=
act_layer
,
norm_layer
=
norm_layer
)
self
.
pos_drop
=
nn
.
Dropout
(
p
=
drop_rate
)
dpr
=
[
x
.
item
()
for
x
in
torch
.
linspace
(
0
,
drop_path_rate
,
sum
(
depths
))
]
if
drop_path_type
==
'uniform'
:
for
i
in
range
(
len
(
dpr
)):
dpr
[
i
]
=
drop_path_rate
self
.
levels
=
nn
.
ModuleList
()
for
i
in
range
(
self
.
num_levels
):
level
=
InternImageBlock
(
core_op
=
getattr
(
opsm
,
core_op
),
channels
=
int
(
channels
*
2
**
i
),
depth
=
depths
[
i
],
groups
=
groups
[
i
],
mlp_ratio
=
self
.
mlp_ratio
,
drop
=
drop_rate
,
drop_path
=
dpr
[
sum
(
depths
[:
i
]):
sum
(
depths
[:
i
+
1
])],
act_layer
=
act_layer
,
norm_layer
=
norm_layer
,
post_norm
=
post_norm
,
downsample
=
(
i
<
self
.
num_levels
-
1
),
layer_scale
=
layer_scale
,
offset_scale
=
offset_scale
,
with_cp
=
with_cp
)
self
.
levels
.
append
(
level
)
self
.
num_layers
=
len
(
depths
)
self
.
apply
(
self
.
_init_weights
)
self
.
apply
(
self
.
_init_deform_weights
)
def
init_weights
(
self
):
logger
=
get_root_logger
()
if
self
.
init_cfg
is
None
:
logger
.
warn
(
f
'No pre-trained weights for '
f
'
{
self
.
__class__
.
__name__
}
, '
f
'training start from scratch'
)
for
m
in
self
.
modules
():
if
isinstance
(
m
,
nn
.
Linear
):
trunc_normal_init
(
m
,
std
=
.
02
,
bias
=
0.
)
elif
isinstance
(
m
,
nn
.
LayerNorm
):
constant_init
(
m
,
1.0
)
else
:
assert
'checkpoint'
in
self
.
init_cfg
,
f
'Only support '
\
f
'specify `Pretrained` in '
\
f
'`init_cfg` in '
\
f
'
{
self
.
__class__
.
__name__
}
'
ckpt
=
_load_checkpoint
(
self
.
init_cfg
.
checkpoint
,
logger
=
logger
,
map_location
=
'cpu'
)
if
'state_dict'
in
ckpt
:
_state_dict
=
ckpt
[
'state_dict'
]
elif
'model'
in
ckpt
:
_state_dict
=
ckpt
[
'model'
]
else
:
_state_dict
=
ckpt
state_dict
=
OrderedDict
()
for
k
,
v
in
_state_dict
.
items
():
if
k
.
startswith
(
'backbone.'
):
state_dict
[
k
[
9
:]]
=
v
else
:
state_dict
[
k
]
=
v
# strip prefix of state_dict
if
list
(
state_dict
.
keys
())[
0
].
startswith
(
'module.'
):
state_dict
=
{
k
[
7
:]:
v
for
k
,
v
in
state_dict
.
items
()}
# load state_dict
meg
=
self
.
load_state_dict
(
state_dict
,
False
)
logger
.
info
(
meg
)
def
_init_weights
(
self
,
m
):
if
isinstance
(
m
,
nn
.
Linear
):
trunc_normal_
(
m
.
weight
,
std
=
.
02
)
if
isinstance
(
m
,
nn
.
Linear
)
and
m
.
bias
is
not
None
:
nn
.
init
.
constant_
(
m
.
bias
,
0
)
elif
isinstance
(
m
,
nn
.
LayerNorm
):
nn
.
init
.
constant_
(
m
.
bias
,
0
)
nn
.
init
.
constant_
(
m
.
weight
,
1.0
)
def
_init_deform_weights
(
self
,
m
):
if
isinstance
(
m
,
getattr
(
opsm
,
self
.
core_op
)):
m
.
_reset_parameters
()
def
forward
(
self
,
x
):
x
=
self
.
patch_embed
(
x
)
x
=
self
.
pos_drop
(
x
)
seq_out
=
[]
for
level_idx
,
level
in
enumerate
(
self
.
levels
):
x
,
x_
=
level
(
x
,
return_wo_downsample
=
True
)
if
level_idx
in
self
.
out_indices
:
seq_out
.
append
(
x_
.
permute
(
0
,
3
,
1
,
2
).
contiguous
())
return
seq_out
segmentation/ops_dcnv3/functions/__init__.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
.dcnv3_func
import
DCNv3Function
,
dcnv3_core_pytorch
segmentation/ops_dcnv3/functions/dcnv3_func.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
__future__
import
absolute_import
from
__future__
import
print_function
from
__future__
import
division
import
torch
import
torch.nn.functional
as
F
from
torch.autograd
import
Function
from
torch.autograd.function
import
once_differentiable
from
torch.cuda.amp
import
custom_bwd
,
custom_fwd
import
DCNv3
class
DCNv3Function
(
Function
):
@
staticmethod
@
custom_fwd
def
forward
(
ctx
,
input
,
offset
,
mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
offset_scale
,
im2col_step
):
ctx
.
kernel_h
=
kernel_h
ctx
.
kernel_w
=
kernel_w
ctx
.
stride_h
=
stride_h
ctx
.
stride_w
=
stride_w
ctx
.
pad_h
=
pad_h
ctx
.
pad_w
=
pad_w
ctx
.
dilation_h
=
dilation_h
ctx
.
dilation_w
=
dilation_w
ctx
.
group
=
group
ctx
.
group_channels
=
group_channels
ctx
.
offset_scale
=
offset_scale
ctx
.
im2col_step
=
im2col_step
output
=
DCNv3
.
dcnv3_forward
(
input
,
offset
,
mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
offset_scale
,
ctx
.
im2col_step
)
ctx
.
save_for_backward
(
input
,
offset
,
mask
)
return
output
@
staticmethod
@
once_differentiable
@
custom_bwd
def
backward
(
ctx
,
grad_output
):
input
,
offset
,
mask
=
ctx
.
saved_tensors
grad_input
,
grad_offset
,
grad_mask
=
\
DCNv3
.
dcnv3_backward
(
input
,
offset
,
mask
,
ctx
.
kernel_h
,
ctx
.
kernel_w
,
ctx
.
stride_h
,
ctx
.
stride_w
,
ctx
.
pad_h
,
ctx
.
pad_w
,
ctx
.
dilation_h
,
ctx
.
dilation_w
,
ctx
.
group
,
ctx
.
group_channels
,
ctx
.
offset_scale
,
grad_output
.
contiguous
(),
ctx
.
im2col_step
)
return
grad_input
,
grad_offset
,
grad_mask
,
\
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
def
_get_reference_points
(
spatial_shapes
,
device
,
kernel_h
,
kernel_w
,
dilation_h
,
dilation_w
,
pad_h
=
0
,
pad_w
=
0
,
stride_h
=
1
,
stride_w
=
1
):
_
,
H_
,
W_
,
_
=
spatial_shapes
H_out
=
(
H_
-
(
dilation_h
*
(
kernel_h
-
1
)
+
1
))
//
stride_h
+
1
W_out
=
(
W_
-
(
dilation_w
*
(
kernel_w
-
1
)
+
1
))
//
stride_w
+
1
ref_y
,
ref_x
=
torch
.
meshgrid
(
torch
.
linspace
(
# pad_h + 0.5,
# H_ - pad_h - 0.5,
(
dilation_h
*
(
kernel_h
-
1
))
//
2
+
0.5
,
(
dilation_h
*
(
kernel_h
-
1
))
//
2
+
0.5
+
(
H_out
-
1
)
*
stride_h
,
H_out
,
dtype
=
torch
.
float32
,
device
=
device
),
torch
.
linspace
(
# pad_w + 0.5,
# W_ - pad_w - 0.5,
(
dilation_w
*
(
kernel_w
-
1
))
//
2
+
0.5
,
(
dilation_w
*
(
kernel_w
-
1
))
//
2
+
0.5
+
(
W_out
-
1
)
*
stride_w
,
W_out
,
dtype
=
torch
.
float32
,
device
=
device
))
ref_y
=
ref_y
.
reshape
(
-
1
)[
None
]
/
H_
ref_x
=
ref_x
.
reshape
(
-
1
)[
None
]
/
W_
ref
=
torch
.
stack
((
ref_x
,
ref_y
),
-
1
).
reshape
(
1
,
H_out
,
W_out
,
1
,
2
)
return
ref
def
_generate_dilation_grids
(
spatial_shapes
,
kernel_h
,
kernel_w
,
dilation_h
,
dilation_w
,
group
,
device
):
_
,
H_
,
W_
,
_
=
spatial_shapes
points_list
=
[]
x
,
y
=
torch
.
meshgrid
(
torch
.
linspace
(
-
((
dilation_w
*
(
kernel_w
-
1
))
//
2
),
-
((
dilation_w
*
(
kernel_w
-
1
))
//
2
)
+
(
kernel_w
-
1
)
*
dilation_w
,
kernel_w
,
dtype
=
torch
.
float32
,
device
=
device
),
torch
.
linspace
(
-
((
dilation_h
*
(
kernel_h
-
1
))
//
2
),
-
((
dilation_h
*
(
kernel_h
-
1
))
//
2
)
+
(
kernel_h
-
1
)
*
dilation_h
,
kernel_h
,
dtype
=
torch
.
float32
,
device
=
device
))
points_list
.
extend
([
x
/
W_
,
y
/
H_
])
grid
=
torch
.
stack
(
points_list
,
-
1
).
reshape
(
-
1
,
1
,
2
).
\
repeat
(
1
,
group
,
1
).
permute
(
1
,
0
,
2
)
grid
=
grid
.
reshape
(
1
,
1
,
1
,
group
*
kernel_h
*
kernel_w
,
2
)
return
grid
def
dcnv3_core_pytorch
(
input
,
offset
,
mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
offset_scale
):
# for debug and test only,
# need to use cuda version instead
input
=
F
.
pad
(
input
,
[
0
,
0
,
pad_h
,
pad_h
,
pad_w
,
pad_w
])
N_
,
H_in
,
W_in
,
_
=
input
.
shape
_
,
H_out
,
W_out
,
_
=
offset
.
shape
ref
=
_get_reference_points
(
input
.
shape
,
input
.
device
,
kernel_h
,
kernel_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
)
grid
=
_generate_dilation_grids
(
input
.
shape
,
kernel_h
,
kernel_w
,
dilation_h
,
dilation_w
,
group
,
input
.
device
)
spatial_norm
=
torch
.
tensor
([
W_in
,
H_in
]).
reshape
(
1
,
1
,
1
,
2
).
\
repeat
(
1
,
1
,
1
,
group
*
kernel_h
*
kernel_w
).
to
(
input
.
device
)
sampling_locations
=
(
ref
+
grid
*
offset_scale
).
repeat
(
N_
,
1
,
1
,
1
,
1
).
flatten
(
3
,
4
)
+
\
offset
*
offset_scale
/
spatial_norm
P_
=
kernel_h
*
kernel_w
sampling_grids
=
2
*
sampling_locations
-
1
# N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, W_in
input_
=
input
.
view
(
N_
,
H_in
*
W_in
,
group
*
group_channels
).
transpose
(
1
,
2
).
\
reshape
(
N_
*
group
,
group_channels
,
H_in
,
W_in
)
# N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
sampling_grid_
=
sampling_grids
.
view
(
N_
,
H_out
*
W_out
,
group
,
P_
,
2
).
transpose
(
1
,
2
).
\
flatten
(
0
,
1
)
# N_*group, group_channels, H_out*W_out, P_
sampling_input_
=
F
.
grid_sample
(
input_
,
sampling_grid_
,
mode
=
'bilinear'
,
padding_mode
=
'zeros'
,
align_corners
=
False
)
# (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_ -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_)
mask
=
mask
.
view
(
N_
,
H_out
*
W_out
,
group
,
P_
).
transpose
(
1
,
2
).
\
reshape
(
N_
*
group
,
1
,
H_out
*
W_out
,
P_
)
output
=
(
sampling_input_
*
mask
).
sum
(
-
1
).
view
(
N_
,
group
*
group_channels
,
H_out
*
W_out
)
return
output
.
transpose
(
1
,
2
).
reshape
(
N_
,
H_out
,
W_out
,
-
1
).
contiguous
()
segmentation/ops_dcnv3/make.sh
0 → 100755
View file @
c4552f79
#!/usr/bin/env bash
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
python setup.py build
install
segmentation/ops_dcnv3/modules/__init__.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
.dcnv3
import
DCNv3
,
DCNv3_pytorch
\ No newline at end of file
segmentation/ops_dcnv3/modules/dcnv3.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
__future__
import
absolute_import
from
__future__
import
print_function
from
__future__
import
division
import
warnings
from
torch
import
nn
import
torch.nn.functional
as
F
from
torch.nn.init
import
xavier_uniform_
,
constant_
from
..functions
import
DCNv3Function
,
dcnv3_core_pytorch
class
to_channels_first
(
nn
.
Module
):
def
__init__
(
self
):
super
().
__init__
()
def
forward
(
self
,
x
):
return
x
.
permute
(
0
,
3
,
1
,
2
)
class
to_channels_last
(
nn
.
Module
):
def
__init__
(
self
):
super
().
__init__
()
def
forward
(
self
,
x
):
return
x
.
permute
(
0
,
2
,
3
,
1
)
def
build_norm_layer
(
dim
,
norm_layer
,
in_format
=
'channels_last'
,
out_format
=
'channels_last'
,
eps
=
1e-6
):
layers
=
[]
if
norm_layer
==
'BN'
:
if
in_format
==
'channels_last'
:
layers
.
append
(
to_channels_first
())
layers
.
append
(
nn
.
BatchNorm2d
(
dim
))
if
out_format
==
'channels_last'
:
layers
.
append
(
to_channels_last
())
elif
norm_layer
==
'LN'
:
if
in_format
==
'channels_first'
:
layers
.
append
(
to_channels_last
())
layers
.
append
(
nn
.
LayerNorm
(
dim
,
eps
=
eps
))
if
out_format
==
'channels_first'
:
layers
.
append
(
to_channels_first
())
else
:
raise
NotImplementedError
(
f
'build_norm_layer does not support
{
norm_layer
}
'
)
return
nn
.
Sequential
(
*
layers
)
def
build_act_layer
(
act_layer
):
if
act_layer
==
'ReLU'
:
return
nn
.
ReLU
(
inplace
=
True
)
elif
act_layer
==
'SiLU'
:
return
nn
.
SiLU
(
inplace
=
True
)
elif
act_layer
==
'GELU'
:
return
nn
.
GELU
()
raise
NotImplementedError
(
f
'build_act_layer does not support
{
act_layer
}
'
)
def
_is_power_of_2
(
n
):
if
(
not
isinstance
(
n
,
int
))
or
(
n
<
0
):
raise
ValueError
(
"invalid input for _is_power_of_2: {} (type: {})"
.
format
(
n
,
type
(
n
)))
return
(
n
&
(
n
-
1
)
==
0
)
and
n
!=
0
class
DCNv3_pytorch
(
nn
.
Module
):
def
__init__
(
self
,
channels
=
64
,
kernel_size
=
3
,
stride
=
1
,
pad
=
1
,
dilation
=
1
,
group
=
4
,
offset_scale
=
1.0
,
act_layer
=
'GELU'
,
norm_layer
=
'LN'
):
"""
DCNv3 Module
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super
().
__init__
()
if
channels
%
group
!=
0
:
raise
ValueError
(
f
'channels must be divisible by group, but got
{
channels
}
and
{
group
}
'
)
_d_per_group
=
channels
//
group
# you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
if
not
_is_power_of_2
(
_d_per_group
):
warnings
.
warn
(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation."
)
self
.
offset_scale
=
offset_scale
self
.
channels
=
channels
self
.
kernel_size
=
kernel_size
self
.
stride
=
stride
self
.
dilation
=
1
self
.
pad
=
pad
self
.
group
=
group
self
.
group_channels
=
channels
//
group
self
.
offset_scale
=
offset_scale
self
.
dw_conv
=
nn
.
Sequential
(
nn
.
Conv2d
(
channels
,
channels
,
kernel_size
=
kernel_size
,
stride
=
1
,
padding
=
(
kernel_size
-
1
)
//
2
,
groups
=
channels
),
build_norm_layer
(
channels
,
norm_layer
,
'channels_first'
,
'channels_last'
),
build_act_layer
(
act_layer
))
self
.
offset
=
nn
.
Linear
(
channels
,
group
*
kernel_size
*
kernel_size
*
2
)
self
.
mask
=
nn
.
Linear
(
channels
,
group
*
kernel_size
*
kernel_size
)
self
.
input_proj
=
nn
.
Linear
(
channels
,
channels
)
self
.
output_proj
=
nn
.
Linear
(
channels
,
channels
)
self
.
_reset_parameters
()
def
_reset_parameters
(
self
):
constant_
(
self
.
offset
.
weight
.
data
,
0.
)
constant_
(
self
.
offset
.
bias
.
data
,
0.
)
constant_
(
self
.
mask
.
weight
.
data
,
0.
)
constant_
(
self
.
mask
.
bias
.
data
,
0.
)
xavier_uniform_
(
self
.
input_proj
.
weight
.
data
)
constant_
(
self
.
input_proj
.
bias
.
data
,
0.
)
xavier_uniform_
(
self
.
output_proj
.
weight
.
data
)
constant_
(
self
.
output_proj
.
bias
.
data
,
0.
)
def
forward
(
self
,
input
):
"""
:param query (N, H, W, C)
:return output (N, H, W, C)
"""
N
,
H
,
W
,
_
=
input
.
shape
x
=
self
.
input_proj
(
input
)
x1
=
input
.
permute
(
0
,
3
,
1
,
2
)
x1
=
self
.
dw_conv
(
x1
)
offset
=
self
.
offset
(
x1
)
mask
=
self
.
mask
(
x1
).
reshape
(
N
,
H
,
W
,
self
.
group
,
-
1
)
mask
=
F
.
softmax
(
mask
,
-
1
).
reshape
(
N
,
H
,
W
,
-
1
)
x
=
dcnv3_core_pytorch
(
x
,
offset
,
mask
,
self
.
kernel_size
,
self
.
kernel_size
,
self
.
stride
,
self
.
stride
,
self
.
pad
,
self
.
pad
,
self
.
dilation
,
self
.
dilation
,
self
.
group
,
self
.
group_channels
,
self
.
offset_scale
)
x
=
self
.
output_proj
(
x
)
return
x
class
DCNv3
(
nn
.
Module
):
def
__init__
(
self
,
channels
=
64
,
kernel_size
=
3
,
stride
=
1
,
pad
=
1
,
dilation
=
1
,
group
=
4
,
offset_scale
=
1.0
,
act_layer
=
'GELU'
,
norm_layer
=
'LN'
):
"""
DCNv3 Module
:param channels
:param kernel_size
:param stride
:param pad
:param dilation
:param group
:param offset_scale
:param act_layer
:param norm_layer
"""
super
().
__init__
()
if
channels
%
group
!=
0
:
raise
ValueError
(
f
'channels must be divisible by group, but got
{
channels
}
and
{
group
}
'
)
_d_per_group
=
channels
//
group
# you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
if
not
_is_power_of_2
(
_d_per_group
):
warnings
.
warn
(
"You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation."
)
self
.
offset_scale
=
offset_scale
self
.
channels
=
channels
self
.
kernel_size
=
kernel_size
self
.
stride
=
stride
self
.
dilation
=
1
self
.
pad
=
pad
self
.
group
=
group
self
.
group_channels
=
channels
//
group
self
.
offset_scale
=
offset_scale
self
.
dw_conv
=
nn
.
Sequential
(
nn
.
Conv2d
(
channels
,
channels
,
kernel_size
=
kernel_size
,
stride
=
1
,
padding
=
(
kernel_size
-
1
)
//
2
,
groups
=
channels
),
build_norm_layer
(
channels
,
norm_layer
,
'channels_first'
,
'channels_last'
),
build_act_layer
(
act_layer
))
self
.
offset
=
nn
.
Linear
(
channels
,
group
*
kernel_size
*
kernel_size
*
2
)
self
.
mask
=
nn
.
Linear
(
channels
,
group
*
kernel_size
*
kernel_size
)
self
.
input_proj
=
nn
.
Linear
(
channels
,
channels
)
self
.
output_proj
=
nn
.
Linear
(
channels
,
channels
)
self
.
_reset_parameters
()
def
_reset_parameters
(
self
):
constant_
(
self
.
offset
.
weight
.
data
,
0.
)
constant_
(
self
.
offset
.
bias
.
data
,
0.
)
constant_
(
self
.
mask
.
weight
.
data
,
0.
)
constant_
(
self
.
mask
.
bias
.
data
,
0.
)
xavier_uniform_
(
self
.
input_proj
.
weight
.
data
)
constant_
(
self
.
input_proj
.
bias
.
data
,
0.
)
xavier_uniform_
(
self
.
output_proj
.
weight
.
data
)
constant_
(
self
.
output_proj
.
bias
.
data
,
0.
)
def
forward
(
self
,
input
):
"""
:param query (N, H, W, C)
:return output (N, H, W, C)
"""
N
,
H
,
W
,
_
=
input
.
shape
x
=
self
.
input_proj
(
input
)
dtype
=
x
.
dtype
x1
=
input
.
permute
(
0
,
3
,
1
,
2
)
x1
=
self
.
dw_conv
(
x1
)
offset
=
self
.
offset
(
x1
)
mask
=
self
.
mask
(
x1
).
reshape
(
N
,
H
,
W
,
self
.
group
,
-
1
)
mask
=
F
.
softmax
(
mask
,
-
1
).
reshape
(
N
,
H
,
W
,
-
1
).
type
(
dtype
)
x
=
DCNv3Function
.
apply
(
x
,
offset
,
mask
,
self
.
kernel_size
,
self
.
kernel_size
,
self
.
stride
,
self
.
stride
,
self
.
pad
,
self
.
pad
,
self
.
dilation
,
self
.
dilation
,
self
.
group
,
self
.
group_channels
,
self
.
offset_scale
,
256
)
x
=
self
.
output_proj
(
x
)
return
x
segmentation/ops_dcnv3/setup.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
os
import
glob
import
torch
from
torch.utils.cpp_extension
import
CUDA_HOME
from
torch.utils.cpp_extension
import
CppExtension
from
torch.utils.cpp_extension
import
CUDAExtension
from
setuptools
import
find_packages
from
setuptools
import
setup
requirements
=
[
"torch"
,
"torchvision"
]
def
get_extensions
():
this_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
extensions_dir
=
os
.
path
.
join
(
this_dir
,
"src"
)
main_file
=
glob
.
glob
(
os
.
path
.
join
(
extensions_dir
,
"*.cpp"
))
source_cpu
=
glob
.
glob
(
os
.
path
.
join
(
extensions_dir
,
"cpu"
,
"*.cpp"
))
source_cuda
=
glob
.
glob
(
os
.
path
.
join
(
extensions_dir
,
"cuda"
,
"*.cu"
))
sources
=
main_file
+
source_cpu
extension
=
CppExtension
extra_compile_args
=
{
"cxx"
:
[]}
define_macros
=
[]
if
torch
.
cuda
.
is_available
()
and
CUDA_HOME
is
not
None
:
extension
=
CUDAExtension
sources
+=
source_cuda
define_macros
+=
[(
"WITH_CUDA"
,
None
)]
extra_compile_args
[
"nvcc"
]
=
[
# "-DCUDA_HAS_FP16=1",
# "-D__CUDA_NO_HALF_OPERATORS__",
# "-D__CUDA_NO_HALF_CONVERSIONS__",
# "-D__CUDA_NO_HALF2_OPERATORS__",
]
else
:
raise
NotImplementedError
(
'Cuda is not availabel'
)
sources
=
[
os
.
path
.
join
(
extensions_dir
,
s
)
for
s
in
sources
]
include_dirs
=
[
extensions_dir
]
ext_modules
=
[
extension
(
"DCNv3"
,
sources
,
include_dirs
=
include_dirs
,
define_macros
=
define_macros
,
extra_compile_args
=
extra_compile_args
,
)
]
return
ext_modules
setup
(
name
=
"DCNv3"
,
version
=
"1.0"
,
author
=
"InternImage"
,
url
=
"https://github.com/OpenGVLab/InternImage"
,
description
=
"PyTorch Wrapper for CUDA Functions of DCNv3"
,
packages
=
find_packages
(
exclude
=
(
"configs"
,
"tests"
,
)),
ext_modules
=
get_extensions
(),
cmdclass
=
{
"build_ext"
:
torch
.
utils
.
cpp_extension
.
BuildExtension
},
)
segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
0 → 100644
View file @
c4552f79
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at
::
Tensor
dcnv3_cpu_forward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
int
im2col_step
)
{
AT_ERROR
(
"Not implement on cpu"
);
}
std
::
vector
<
at
::
Tensor
>
dcnv3_cpu_backward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
at
::
Tensor
&
grad_output
,
const
int
im2col_step
)
{
AT_ERROR
(
"Not implement on cpu"
);
}
segmentation/ops_dcnv3/src/cpu/dcnv3_cpu.h
0 → 100644
View file @
c4552f79
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at
::
Tensor
dcnv3_cpu_forward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
int
im2col_step
);
std
::
vector
<
at
::
Tensor
>
dcnv3_cpu_backward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
at
::
Tensor
&
grad_output
,
const
int
im2col_step
);
segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
0 → 100644
View file @
c4552f79
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "cuda/dcnv3_im2col_cuda.cuh"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>
at
::
Tensor
dcnv3_cuda_forward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
int
im2col_step
)
{
AT_ASSERTM
(
input
.
is_contiguous
(),
"input tensor has to be contiguous"
);
AT_ASSERTM
(
offset
.
is_contiguous
(),
"offset tensor has to be contiguous"
);
AT_ASSERTM
(
mask
.
is_contiguous
(),
"mask tensor has to be contiguous"
);
AT_ASSERTM
(
input
.
type
().
is_cuda
(),
"input must be a CUDA tensor"
);
AT_ASSERTM
(
offset
.
type
().
is_cuda
(),
"offset must be a CUDA tensor"
);
AT_ASSERTM
(
mask
.
type
().
is_cuda
(),
"mask must be a CUDA tensor"
);
const
int
batch
=
input
.
size
(
0
);
const
int
height_in
=
input
.
size
(
1
);
const
int
width_in
=
input
.
size
(
2
);
const
int
channels
=
input
.
size
(
3
);
const
int
height_out
=
(
height_in
+
2
*
pad_h
-
(
dilation_h
*
(
kernel_h
-
1
)
+
1
))
/
stride_h
+
1
;
const
int
width_out
=
(
width_in
+
2
*
pad_w
-
(
dilation_w
*
(
kernel_w
-
1
)
+
1
))
/
stride_w
+
1
;
const
int
im2col_step_
=
std
::
min
(
batch
,
im2col_step
);
AT_ASSERTM
(
batch
%
im2col_step_
==
0
,
"batch(%d) must divide im2col_step(%d)"
,
batch
,
im2col_step_
);
AT_ASSERTM
(
channels
==
(
group
*
group_channels
),
"Input channels and group times group channels wont match: (%d vs %d)."
,
channels
,
group
*
group_channels
);
auto
output
=
at
::
zeros
({
batch
,
height_out
,
width_out
,
group
*
group_channels
},
input
.
options
());
const
int
batch_n
=
im2col_step_
;
auto
output_n
=
output
.
view
({
batch
/
batch_n
,
batch_n
,
height_out
,
width_out
,
group
*
group_channels
});
auto
per_input_size
=
height_in
*
width_in
*
group
*
group_channels
;
auto
per_offset_size
=
height_out
*
width_out
*
group
*
kernel_h
*
kernel_w
*
2
;
auto
per_mask_size
=
height_out
*
width_out
*
group
*
kernel_h
*
kernel_w
;
for
(
int
n
=
0
;
n
<
batch
/
im2col_step_
;
++
n
)
{
auto
columns
=
output_n
.
select
(
0
,
n
);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
input
.
type
(),
"ms_deform_attn_forward_cuda"
,
([
&
]
{
dcnv3_im2col_cuda
(
at
::
cuda
::
getCurrentCUDAStream
(),
input
.
data
<
scalar_t
>
()
+
n
*
im2col_step_
*
per_input_size
,
offset
.
data
<
scalar_t
>
()
+
n
*
im2col_step_
*
per_offset_size
,
mask
.
data
<
scalar_t
>
()
+
n
*
im2col_step_
*
per_mask_size
,
columns
.
data
<
scalar_t
>
(),
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
batch_n
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
);
}));
}
return
output
;
}
std
::
vector
<
at
::
Tensor
>
dcnv3_cuda_backward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
at
::
Tensor
&
grad_output
,
const
int
im2col_step
)
{
AT_ASSERTM
(
input
.
is_contiguous
(),
"input tensor has to be contiguous"
);
AT_ASSERTM
(
offset
.
is_contiguous
(),
"offset tensor has to be contiguous"
);
AT_ASSERTM
(
mask
.
is_contiguous
(),
"mask tensor has to be contiguous"
);
AT_ASSERTM
(
grad_output
.
is_contiguous
(),
"grad_output tensor has to be contiguous"
);
AT_ASSERTM
(
input
.
type
().
is_cuda
(),
"input must be a CUDA tensor"
);
AT_ASSERTM
(
offset
.
type
().
is_cuda
(),
"offset must be a CUDA tensor"
);
AT_ASSERTM
(
mask
.
type
().
is_cuda
(),
"mask must be a CUDA tensor"
);
AT_ASSERTM
(
grad_output
.
type
().
is_cuda
(),
"grad_output must be a CUDA tensor"
);
const
int
batch
=
input
.
size
(
0
);
const
int
height_in
=
input
.
size
(
1
);
const
int
width_in
=
input
.
size
(
2
);
const
int
channels
=
input
.
size
(
3
);
const
int
height_out
=
(
height_in
+
2
*
pad_h
-
(
dilation_h
*
(
kernel_h
-
1
)
+
1
))
/
stride_h
+
1
;
const
int
width_out
=
(
width_in
+
2
*
pad_w
-
(
dilation_w
*
(
kernel_w
-
1
)
+
1
))
/
stride_w
+
1
;
const
int
im2col_step_
=
std
::
min
(
batch
,
im2col_step
);
AT_ASSERTM
(
batch
%
im2col_step_
==
0
,
"batch(%d) must divide im2col_step(%d)"
,
batch
,
im2col_step_
);
AT_ASSERTM
(
channels
==
(
group
*
group_channels
),
"Input channels and group times group channels wont match: (%d vs %d)."
,
channels
,
group
*
group_channels
);
auto
dtype
=
input
.
dtype
();
if
(
dtype
==
at
::
kHalf
)
{
dtype
=
at
::
kFloat
;
}
auto
grad_input
=
at
::
zeros_like
(
input
,
dtype
);
auto
grad_offset
=
at
::
zeros_like
(
offset
,
dtype
);
auto
grad_mask
=
at
::
zeros_like
(
mask
,
dtype
);
const
int
batch_n
=
im2col_step_
;
auto
per_input_size
=
height_in
*
width_in
*
group
*
group_channels
;
auto
per_offset_size
=
height_out
*
width_out
*
group
*
kernel_h
*
kernel_w
*
2
;
auto
per_mask_size
=
height_out
*
width_out
*
group
*
kernel_h
*
kernel_w
;
auto
grad_output_n
=
grad_output
.
view
({
batch
/
im2col_step_
,
batch_n
,
height_out
*
width_out
,
group
,
group_channels
});
for
(
int
n
=
0
;
n
<
batch
/
im2col_step_
;
++
n
)
{
auto
grad_output_g
=
grad_output_n
.
select
(
0
,
n
);
// AT_DISPATCH_FLOATING_TYPES(
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
input
.
type
(),
"ms_deform_attn_backward_cuda"
,
([
&
]
{
dcnv3_col2im_cuda
(
at
::
cuda
::
getCurrentCUDAStream
(),
grad_output_g
.
data
<
scalar_t
>
(),
input
.
data
<
scalar_t
>
()
+
n
*
im2col_step_
*
per_input_size
,
offset
.
data
<
scalar_t
>
()
+
n
*
im2col_step_
*
per_offset_size
,
mask
.
data
<
scalar_t
>
()
+
n
*
im2col_step_
*
per_mask_size
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
batch_n
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_input
.
data
<
opmath_t
>
()
+
n
*
im2col_step_
*
per_input_size
,
grad_offset
.
data
<
opmath_t
>
()
+
n
*
im2col_step_
*
per_offset_size
,
grad_mask
.
data
<
opmath_t
>
()
+
n
*
im2col_step_
*
per_mask_size
);
}));
}
if
(
input
.
dtype
()
==
torch
::
kHalf
)
{
return
{
grad_input
.
to
(
torch
::
kHalf
),
grad_offset
.
to
(
torch
::
kHalf
),
grad_mask
.
to
(
torch
::
kHalf
)};
}
else
{
return
{
grad_input
,
grad_offset
,
grad_mask
};
}
}
\ No newline at end of file
segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.h
0 → 100644
View file @
c4552f79
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at
::
Tensor
dcnv3_cuda_forward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
int
im2col_step
);
std
::
vector
<
at
::
Tensor
>
dcnv3_cuda_backward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
at
::
Tensor
&
grad_output
,
const
int
im2col_step
);
segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
0 → 100644
View file @
c4552f79
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <ATen/ATen.h>
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const
int
CUDA_NUM_THREADS
=
256
;
inline
int
GET_BLOCKS
(
const
int
N
,
const
int
num_threads
)
{
return
(
N
+
num_threads
-
1
)
/
num_threads
;
}
#define opmath_t at::opmath_type<scalar_t>
template
<
typename
scalar_t
>
__device__
opmath_t
dcnv3_im2col_bilinear
(
const
scalar_t
*&
bottom_data
,
const
int
&
height
,
const
int
&
width
,
const
int
&
group
,
const
int
&
group_channels
,
const
opmath_t
&
h
,
const
opmath_t
&
w
,
const
int
&
g
,
const
int
&
c
)
{
const
int
h_low
=
floor
(
h
);
const
int
w_low
=
floor
(
w
);
const
int
h_high
=
h_low
+
1
;
const
int
w_high
=
w_low
+
1
;
const
opmath_t
lh
=
h
-
h_low
;
const
opmath_t
lw
=
w
-
w_low
;
const
opmath_t
hh
=
1
-
lh
,
hw
=
1
-
lw
;
const
int
w_stride
=
group
*
group_channels
;
const
int
h_stride
=
width
*
w_stride
;
const
int
h_low_ptr_offset
=
h_low
*
h_stride
;
const
int
h_high_ptr_offset
=
h_low_ptr_offset
+
h_stride
;
const
int
w_low_ptr_offset
=
w_low
*
w_stride
;
const
int
w_high_ptr_offset
=
w_low_ptr_offset
+
w_stride
;
const
int
base_ptr
=
g
*
group_channels
+
c
;
opmath_t
v1
=
0
;
if
(
h_low
>=
0
&&
w_low
>=
0
)
{
const
int
ptr1
=
h_low_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v1
=
bottom_data
[
ptr1
];
}
opmath_t
v2
=
0
;
if
(
h_low
>=
0
&&
w_high
<=
width
-
1
)
{
const
int
ptr2
=
h_low_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v2
=
bottom_data
[
ptr2
];
}
opmath_t
v3
=
0
;
if
(
h_high
<=
height
-
1
&&
w_low
>=
0
)
{
const
int
ptr3
=
h_high_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v3
=
bottom_data
[
ptr3
];
}
opmath_t
v4
=
0
;
if
(
h_high
<=
height
-
1
&&
w_high
<=
width
-
1
)
{
const
int
ptr4
=
h_high_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v4
=
bottom_data
[
ptr4
];
}
const
opmath_t
w1
=
hh
*
hw
,
w2
=
hh
*
lw
,
w3
=
lh
*
hw
,
w4
=
lh
*
lw
;
const
opmath_t
val
=
(
w1
*
v1
+
w2
*
v2
+
w3
*
v3
+
w4
*
v4
);
return
val
;
}
template
<
typename
scalar_t
>
__device__
void
dcnv3_col2im_bilinear
(
const
scalar_t
*&
bottom_data
,
const
int
&
height
,
const
int
&
width
,
const
int
&
nheads
,
const
int
&
group_channels
,
const
opmath_t
&
h
,
const
opmath_t
&
w
,
const
int
&
m
,
const
int
&
c
,
const
opmath_t
offset_scale
,
const
opmath_t
&
top_grad
,
const
opmath_t
&
mask
,
opmath_t
*&
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
const
int
h_low
=
floor
(
h
);
const
int
w_low
=
floor
(
w
);
const
int
h_high
=
h_low
+
1
;
const
int
w_high
=
w_low
+
1
;
const
opmath_t
lh
=
h
-
h_low
;
const
opmath_t
lw
=
w
-
w_low
;
const
opmath_t
hh
=
1
-
lh
,
hw
=
1
-
lw
;
const
int
w_stride
=
nheads
*
group_channels
;
const
int
h_stride
=
width
*
w_stride
;
const
int
h_low_ptr_offset
=
h_low
*
h_stride
;
const
int
h_high_ptr_offset
=
h_low_ptr_offset
+
h_stride
;
const
int
w_low_ptr_offset
=
w_low
*
w_stride
;
const
int
w_high_ptr_offset
=
w_low_ptr_offset
+
w_stride
;
const
int
base_ptr
=
m
*
group_channels
+
c
;
const
opmath_t
w1
=
hh
*
hw
,
w2
=
hh
*
lw
,
w3
=
lh
*
hw
,
w4
=
lh
*
lw
;
const
opmath_t
top_grad_im
=
top_grad
*
mask
;
opmath_t
grad_h_weight
=
0
,
grad_w_weight
=
0
;
opmath_t
v1
=
0
;
if
(
h_low
>=
0
&&
w_low
>=
0
)
{
const
int
ptr1
=
h_low_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v1
=
bottom_data
[
ptr1
];
grad_h_weight
-=
hw
*
v1
;
grad_w_weight
-=
hh
*
v1
;
atomicAdd
(
grad_im
+
ptr1
,
w1
*
top_grad_im
);
}
opmath_t
v2
=
0
;
if
(
h_low
>=
0
&&
w_high
<=
width
-
1
)
{
const
int
ptr2
=
h_low_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v2
=
bottom_data
[
ptr2
];
grad_h_weight
-=
lw
*
v2
;
grad_w_weight
+=
hh
*
v2
;
atomicAdd
(
grad_im
+
ptr2
,
w2
*
top_grad_im
);
}
opmath_t
v3
=
0
;
if
(
h_high
<=
height
-
1
&&
w_low
>=
0
)
{
const
int
ptr3
=
h_high_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v3
=
bottom_data
[
ptr3
];
grad_h_weight
+=
hw
*
v3
;
grad_w_weight
-=
lh
*
v3
;
atomicAdd
(
grad_im
+
ptr3
,
w3
*
top_grad_im
);
}
opmath_t
v4
=
0
;
if
(
h_high
<=
height
-
1
&&
w_high
<=
width
-
1
)
{
const
int
ptr4
=
h_high_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v4
=
bottom_data
[
ptr4
];
grad_h_weight
+=
lw
*
v4
;
grad_w_weight
+=
lh
*
v4
;
atomicAdd
(
grad_im
+
ptr4
,
w4
*
top_grad_im
);
}
const
opmath_t
val
=
(
w1
*
v1
+
w2
*
v2
+
w3
*
v3
+
w4
*
v4
);
*
grad_mask
=
top_grad
*
val
;
*
grad_offset
=
offset_scale
*
grad_w_weight
*
top_grad_im
;
*
(
grad_offset
+
1
)
=
offset_scale
*
grad_h_weight
*
top_grad_im
;
}
template
<
typename
scalar_t
>
__device__
void
dcnv3_col2im_bilinear_gm
(
const
scalar_t
*&
bottom_data
,
const
int
&
height
,
const
int
&
width
,
const
int
&
nheads
,
const
int
&
group_channels
,
const
opmath_t
&
h
,
const
opmath_t
&
w
,
const
int
&
m
,
const
int
&
c
,
const
opmath_t
offset_scale
,
const
opmath_t
&
top_grad
,
const
opmath_t
&
mask
,
opmath_t
*&
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
const
int
h_low
=
floor
(
h
);
const
int
w_low
=
floor
(
w
);
const
int
h_high
=
h_low
+
1
;
const
int
w_high
=
w_low
+
1
;
const
opmath_t
lh
=
h
-
h_low
;
const
opmath_t
lw
=
w
-
w_low
;
const
opmath_t
hh
=
1
-
lh
,
hw
=
1
-
lw
;
const
int
w_stride
=
nheads
*
group_channels
;
const
int
h_stride
=
width
*
w_stride
;
const
int
h_low_ptr_offset
=
h_low
*
h_stride
;
const
int
h_high_ptr_offset
=
h_low_ptr_offset
+
h_stride
;
const
int
w_low_ptr_offset
=
w_low
*
w_stride
;
const
int
w_high_ptr_offset
=
w_low_ptr_offset
+
w_stride
;
const
int
base_ptr
=
m
*
group_channels
+
c
;
const
opmath_t
w1
=
hh
*
hw
,
w2
=
hh
*
lw
,
w3
=
lh
*
hw
,
w4
=
lh
*
lw
;
const
opmath_t
top_grad_im
=
top_grad
*
mask
;
opmath_t
grad_h_weight
=
0
,
grad_w_weight
=
0
;
opmath_t
v1
=
0
;
if
(
h_low
>=
0
&&
w_low
>=
0
)
{
const
int
ptr1
=
h_low_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v1
=
bottom_data
[
ptr1
];
grad_h_weight
-=
hw
*
v1
;
grad_w_weight
-=
hh
*
v1
;
atomicAdd
(
grad_im
+
ptr1
,
w1
*
top_grad_im
);
}
opmath_t
v2
=
0
;
if
(
h_low
>=
0
&&
w_high
<=
width
-
1
)
{
const
int
ptr2
=
h_low_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v2
=
bottom_data
[
ptr2
];
grad_h_weight
-=
lw
*
v2
;
grad_w_weight
+=
hh
*
v2
;
atomicAdd
(
grad_im
+
ptr2
,
w2
*
top_grad_im
);
}
opmath_t
v3
=
0
;
if
(
h_high
<=
height
-
1
&&
w_low
>=
0
)
{
const
int
ptr3
=
h_high_ptr_offset
+
w_low_ptr_offset
+
base_ptr
;
v3
=
bottom_data
[
ptr3
];
grad_h_weight
+=
hw
*
v3
;
grad_w_weight
-=
lh
*
v3
;
atomicAdd
(
grad_im
+
ptr3
,
w3
*
top_grad_im
);
}
opmath_t
v4
=
0
;
if
(
h_high
<=
height
-
1
&&
w_high
<=
width
-
1
)
{
const
int
ptr4
=
h_high_ptr_offset
+
w_high_ptr_offset
+
base_ptr
;
v4
=
bottom_data
[
ptr4
];
grad_h_weight
+=
lw
*
v4
;
grad_w_weight
+=
lh
*
v4
;
atomicAdd
(
grad_im
+
ptr4
,
w4
*
top_grad_im
);
}
const
opmath_t
val
=
(
w1
*
v1
+
w2
*
v2
+
w3
*
v3
+
w4
*
v4
);
atomicAdd
(
grad_mask
,
top_grad
*
val
);
atomicAdd
(
grad_offset
,
offset_scale
*
grad_w_weight
*
top_grad_im
);
atomicAdd
(
grad_offset
+
1
,
offset_scale
*
grad_h_weight
*
top_grad_im
);
}
template
<
typename
scalar_t
>
__global__
void
dcnv3_im2col_gpu_kernel
(
const
int
num_kernels
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
scalar_t
*
data_col
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
int
input_size
=
height_in
*
width_in
;
scalar_t
*
data_col_ptr
=
data_col
+
index
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
qid_stride
=
group
*
group_channels
;
opmath_t
col
=
0
;
const
scalar_t
*
data_im_ptr
=
data_im
+
b_col
*
input_size
*
qid_stride
;
// top-left
const
opmath_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
opmath_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
opmath_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
opmath_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
opmath_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
opmath_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
opmath_t
weight
=
data_mask
[
data_weight_ptr
];
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
col
+=
dcnv3_im2col_bilinear
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
)
*
weight
;
}
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
}
}
*
data_col_ptr
=
col
;
}
}
// debug
template
<
typename
scalar_t
,
unsigned
int
blockSize
>
__global__
void
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
(
const
int
num_kernels
,
const
scalar_t
*
grad_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
,
opmath_t
*
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
__shared__
opmath_t
cache_grad_offset
[
blockSize
*
2
];
__shared__
opmath_t
cache_grad_mask
[
blockSize
];
unsigned
int
tid
=
threadIdx
.
x
;
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
opmath_t
top_grad
=
grad_col
[
index
];
const
int
input_size
=
height_in
*
width_in
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
grad_sampling_ptr
=
data_weight_ptr
;
grad_offset
+=
grad_sampling_ptr
<<
1
;
grad_mask
+=
grad_sampling_ptr
;
const
int
qid_stride
=
group
*
group_channels
;
const
int
im_ptr_offset
=
b_col
*
input_size
*
qid_stride
;
const
scalar_t
*
data_im_ptr
=
data_im
+
im_ptr_offset
;
opmath_t
*
grad_im_ptr
=
grad_im
+
im_ptr_offset
;
const
opmath_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
opmath_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
opmath_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
opmath_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
opmath_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
opmath_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
opmath_t
weight
=
data_mask
[
data_weight_ptr
];
*
(
cache_grad_offset
+
(
threadIdx
.
x
<<
1
))
=
0
;
*
(
cache_grad_offset
+
((
threadIdx
.
x
<<
1
)
+
1
))
=
0
;
*
(
cache_grad_mask
+
threadIdx
.
x
)
=
0
;
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
dcnv3_col2im_bilinear
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
,
offset_scale
,
top_grad
,
weight
,
grad_im_ptr
,
cache_grad_offset
+
(
threadIdx
.
x
<<
1
),
cache_grad_mask
+
threadIdx
.
x
);
}
__syncthreads
();
if
(
tid
==
0
)
{
opmath_t
_grad_w
=
cache_grad_offset
[
0
],
_grad_h
=
cache_grad_offset
[
1
],
_grad_a
=
cache_grad_mask
[
0
];
int
sid
=
2
;
for
(
unsigned
int
tid
=
1
;
tid
<
blockSize
;
++
tid
)
{
_grad_w
+=
cache_grad_offset
[
sid
];
_grad_h
+=
cache_grad_offset
[
sid
+
1
];
_grad_a
+=
cache_grad_mask
[
tid
];
sid
+=
2
;
}
*
grad_offset
=
_grad_w
;
*
(
grad_offset
+
1
)
=
_grad_h
;
*
grad_mask
=
_grad_a
;
}
__syncthreads
();
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
grad_mask
+=
1
;
grad_offset
+=
2
;
}
}
}
}
template
<
typename
scalar_t
,
unsigned
int
blockSize
>
__global__
void
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
(
const
int
num_kernels
,
const
scalar_t
*
grad_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
,
opmath_t
*
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
__shared__
opmath_t
cache_grad_offset
[
blockSize
*
2
];
__shared__
opmath_t
cache_grad_mask
[
blockSize
];
unsigned
int
tid
=
threadIdx
.
x
;
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
opmath_t
top_grad
=
grad_col
[
index
];
const
int
input_size
=
height_in
*
width_in
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
grad_sampling_ptr
=
data_weight_ptr
;
grad_offset
+=
grad_sampling_ptr
<<
1
;
grad_mask
+=
grad_sampling_ptr
;
const
int
qid_stride
=
group
*
group_channels
;
const
int
im_ptr_offset
=
b_col
*
input_size
*
qid_stride
;
const
scalar_t
*
data_im_ptr
=
data_im
+
im_ptr_offset
;
opmath_t
*
grad_im_ptr
=
grad_im
+
im_ptr_offset
;
const
opmath_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
opmath_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
opmath_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
opmath_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
opmath_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
opmath_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
opmath_t
weight
=
data_mask
[
data_weight_ptr
];
*
(
cache_grad_offset
+
(
threadIdx
.
x
<<
1
))
=
0
;
*
(
cache_grad_offset
+
((
threadIdx
.
x
<<
1
)
+
1
))
=
0
;
*
(
cache_grad_mask
+
threadIdx
.
x
)
=
0
;
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
dcnv3_col2im_bilinear
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
,
offset_scale
,
top_grad
,
weight
,
grad_im_ptr
,
cache_grad_offset
+
(
threadIdx
.
x
<<
1
),
cache_grad_mask
+
threadIdx
.
x
);
}
__syncthreads
();
for
(
unsigned
int
s
=
blockSize
/
2
;
s
>
0
;
s
>>=
1
)
{
if
(
tid
<
s
)
{
const
unsigned
int
xid1
=
tid
<<
1
;
const
unsigned
int
xid2
=
(
tid
+
s
)
<<
1
;
cache_grad_mask
[
tid
]
+=
cache_grad_mask
[
tid
+
s
];
cache_grad_offset
[
xid1
]
+=
cache_grad_offset
[
xid2
];
cache_grad_offset
[
xid1
+
1
]
+=
cache_grad_offset
[
xid2
+
1
];
}
__syncthreads
();
}
if
(
tid
==
0
)
{
*
grad_offset
=
cache_grad_offset
[
0
];
*
(
grad_offset
+
1
)
=
cache_grad_offset
[
1
];
*
grad_mask
=
cache_grad_mask
[
0
];
}
__syncthreads
();
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
grad_mask
+=
1
;
grad_offset
+=
2
;
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
dcnv3_col2im_gpu_kernel_shm_reduce_v1
(
const
int
num_kernels
,
const
scalar_t
*
grad_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
,
opmath_t
*
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
extern
__shared__
int
_s
[];
opmath_t
*
cache_grad_offset
=
(
opmath_t
*
)
_s
;
opmath_t
*
cache_grad_mask
=
cache_grad_offset
+
2
*
blockDim
.
x
;
unsigned
int
tid
=
threadIdx
.
x
;
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
opmath_t
top_grad
=
grad_col
[
index
];
const
int
input_size
=
height_in
*
width_in
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
grad_sampling_ptr
=
data_weight_ptr
;
grad_offset
+=
grad_sampling_ptr
<<
1
;
grad_mask
+=
grad_sampling_ptr
;
const
int
qid_stride
=
group
*
group_channels
;
const
int
im_ptr_offset
=
b_col
*
input_size
*
qid_stride
;
const
scalar_t
*
data_im_ptr
=
data_im
+
im_ptr_offset
;
opmath_t
*
grad_im_ptr
=
grad_im
+
im_ptr_offset
;
const
opmath_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
opmath_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
opmath_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
opmath_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
opmath_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
opmath_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
opmath_t
weight
=
data_mask
[
data_weight_ptr
];
*
(
cache_grad_offset
+
(
threadIdx
.
x
<<
1
))
=
0
;
*
(
cache_grad_offset
+
((
threadIdx
.
x
<<
1
)
+
1
))
=
0
;
*
(
cache_grad_mask
+
threadIdx
.
x
)
=
0
;
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
dcnv3_col2im_bilinear
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
,
offset_scale
,
top_grad
,
weight
,
grad_im_ptr
,
cache_grad_offset
+
(
threadIdx
.
x
<<
1
),
cache_grad_mask
+
threadIdx
.
x
);
}
__syncthreads
();
if
(
tid
==
0
)
{
opmath_t
_grad_w
=
cache_grad_offset
[
0
],
_grad_h
=
cache_grad_offset
[
1
],
_grad_a
=
cache_grad_mask
[
0
];
int
sid
=
2
;
for
(
unsigned
int
tid
=
1
;
tid
<
blockDim
.
x
;
++
tid
)
{
_grad_w
+=
cache_grad_offset
[
sid
];
_grad_h
+=
cache_grad_offset
[
sid
+
1
];
_grad_a
+=
cache_grad_mask
[
tid
];
sid
+=
2
;
}
*
grad_offset
=
_grad_w
;
*
(
grad_offset
+
1
)
=
_grad_h
;
*
grad_mask
=
_grad_a
;
}
__syncthreads
();
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
grad_mask
+=
1
;
grad_offset
+=
2
;
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
dcnv3_col2im_gpu_kernel_shm_reduce_v2
(
const
int
num_kernels
,
const
scalar_t
*
grad_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
,
opmath_t
*
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
extern
__shared__
int
_s
[];
opmath_t
*
cache_grad_offset
=
(
opmath_t
*
)
_s
;
opmath_t
*
cache_grad_mask
=
cache_grad_offset
+
2
*
blockDim
.
x
;
unsigned
int
tid
=
threadIdx
.
x
;
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
opmath_t
top_grad
=
grad_col
[
index
];
const
int
input_size
=
height_in
*
width_in
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
grad_sampling_ptr
=
data_weight_ptr
;
grad_offset
+=
grad_sampling_ptr
<<
1
;
grad_mask
+=
grad_sampling_ptr
;
const
int
qid_stride
=
group
*
group_channels
;
const
int
im_ptr_offset
=
b_col
*
input_size
*
qid_stride
;
const
scalar_t
*
data_im_ptr
=
data_im
+
im_ptr_offset
;
opmath_t
*
grad_im_ptr
=
grad_im
+
im_ptr_offset
;
const
opmath_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
opmath_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
opmath_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
opmath_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
opmath_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
opmath_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
opmath_t
weight
=
data_mask
[
data_weight_ptr
];
*
(
cache_grad_offset
+
(
threadIdx
.
x
<<
1
))
=
0
;
*
(
cache_grad_offset
+
((
threadIdx
.
x
<<
1
)
+
1
))
=
0
;
*
(
cache_grad_mask
+
threadIdx
.
x
)
=
0
;
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
dcnv3_col2im_bilinear
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
,
offset_scale
,
top_grad
,
weight
,
grad_im_ptr
,
cache_grad_offset
+
(
threadIdx
.
x
<<
1
),
cache_grad_mask
+
threadIdx
.
x
);
}
__syncthreads
();
for
(
unsigned
int
s
=
blockDim
.
x
/
2
,
spre
=
blockDim
.
x
;
s
>
0
;
s
>>=
1
,
spre
>>=
1
)
{
if
(
tid
<
s
)
{
const
unsigned
int
xid1
=
tid
<<
1
;
const
unsigned
int
xid2
=
(
tid
+
s
)
<<
1
;
cache_grad_mask
[
tid
]
+=
cache_grad_mask
[
tid
+
s
];
cache_grad_offset
[
xid1
]
+=
cache_grad_offset
[
xid2
];
cache_grad_offset
[
xid1
+
1
]
+=
cache_grad_offset
[
xid2
+
1
];
if
(
tid
+
(
s
<<
1
)
<
spre
)
{
cache_grad_mask
[
tid
]
+=
cache_grad_mask
[
tid
+
(
s
<<
1
)];
cache_grad_offset
[
xid1
]
+=
cache_grad_offset
[
xid2
+
(
s
<<
1
)];
cache_grad_offset
[
xid1
+
1
]
+=
cache_grad_offset
[
xid2
+
1
+
(
s
<<
1
)];
}
}
__syncthreads
();
}
if
(
tid
==
0
)
{
*
grad_offset
=
cache_grad_offset
[
0
];
*
(
grad_offset
+
1
)
=
cache_grad_offset
[
1
];
*
grad_mask
=
cache_grad_mask
[
0
];
}
__syncthreads
();
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
grad_mask
+=
1
;
grad_offset
+=
2
;
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks
(
const
int
num_kernels
,
const
scalar_t
*
grad_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
,
opmath_t
*
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
extern
__shared__
int
_s
[];
opmath_t
*
cache_grad_offset
=
(
opmath_t
*
)
_s
;
opmath_t
*
cache_grad_mask
=
cache_grad_offset
+
2
*
blockDim
.
x
;
unsigned
int
tid
=
threadIdx
.
x
;
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
opmath_t
top_grad
=
grad_col
[
index
];
const
int
input_size
=
height_in
*
width_in
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
grad_sampling_ptr
=
data_weight_ptr
;
grad_offset
+=
grad_sampling_ptr
<<
1
;
grad_mask
+=
grad_sampling_ptr
;
const
int
qid_stride
=
group
*
group_channels
;
const
int
im_ptr_offset
=
b_col
*
input_size
*
qid_stride
;
const
scalar_t
*
data_im_ptr
=
data_im
+
im_ptr_offset
;
opmath_t
*
grad_im_ptr
=
grad_im
+
im_ptr_offset
;
const
opmath_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
opmath_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
opmath_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
opmath_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
opmath_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
opmath_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
opmath_t
weight
=
data_mask
[
data_weight_ptr
];
*
(
cache_grad_offset
+
(
threadIdx
.
x
<<
1
))
=
0
;
*
(
cache_grad_offset
+
((
threadIdx
.
x
<<
1
)
+
1
))
=
0
;
*
(
cache_grad_mask
+
threadIdx
.
x
)
=
0
;
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
dcnv3_col2im_bilinear
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
,
offset_scale
,
top_grad
,
weight
,
grad_im_ptr
,
cache_grad_offset
+
(
threadIdx
.
x
<<
1
),
cache_grad_mask
+
threadIdx
.
x
);
}
__syncthreads
();
for
(
unsigned
int
s
=
blockDim
.
x
/
2
,
spre
=
blockDim
.
x
;
s
>
0
;
s
>>=
1
,
spre
>>=
1
)
{
if
(
tid
<
s
)
{
const
unsigned
int
xid1
=
tid
<<
1
;
const
unsigned
int
xid2
=
(
tid
+
s
)
<<
1
;
cache_grad_mask
[
tid
]
+=
cache_grad_mask
[
tid
+
s
];
cache_grad_offset
[
xid1
]
+=
cache_grad_offset
[
xid2
];
cache_grad_offset
[
xid1
+
1
]
+=
cache_grad_offset
[
xid2
+
1
];
if
(
tid
+
(
s
<<
1
)
<
spre
)
{
cache_grad_mask
[
tid
]
+=
cache_grad_mask
[
tid
+
(
s
<<
1
)];
cache_grad_offset
[
xid1
]
+=
cache_grad_offset
[
xid2
+
(
s
<<
1
)];
cache_grad_offset
[
xid1
+
1
]
+=
cache_grad_offset
[
xid2
+
1
+
(
s
<<
1
)];
}
}
__syncthreads
();
}
if
(
tid
==
0
)
{
atomicAdd
(
grad_offset
,
cache_grad_offset
[
0
]);
atomicAdd
(
grad_offset
+
1
,
cache_grad_offset
[
1
]);
atomicAdd
(
grad_mask
,
cache_grad_mask
[
0
]);
}
__syncthreads
();
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
grad_mask
+=
1
;
grad_offset
+=
2
;
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
dcnv3_col2im_gpu_kernel_gm
(
const
int
num_kernels
,
const
scalar_t
*
grad_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
,
opmath_t
*
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
CUDA_KERNEL_LOOP
(
index
,
num_kernels
)
{
int
_temp
=
index
;
const
int
c_col
=
_temp
%
group_channels
;
_temp
/=
group_channels
;
const
int
sampling_index
=
_temp
;
const
int
g_col
=
_temp
%
group
;
_temp
/=
group
;
const
int
p0_w
=
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
-
pad_w
+
(
_temp
%
width_out
)
*
stride_w
;
_temp
/=
width_out
;
const
int
p0_h
=
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
-
pad_h
+
(
_temp
%
height_out
)
*
stride_h
;
_temp
/=
height_out
;
const
int
b_col
=
_temp
;
const
opmath_t
top_grad
=
grad_col
[
index
];
const
int
input_size
=
height_in
*
width_in
;
const
int
kernel_size
=
kernel_h
*
kernel_w
;
int
data_weight_ptr
=
sampling_index
*
kernel_size
;
int
data_loc_w_ptr
=
data_weight_ptr
<<
1
;
const
int
grad_sampling_ptr
=
data_weight_ptr
;
grad_offset
+=
grad_sampling_ptr
<<
1
;
grad_mask
+=
grad_sampling_ptr
;
const
int
qid_stride
=
group
*
group_channels
;
const
int
im_ptr_offset
=
b_col
*
input_size
*
qid_stride
;
const
scalar_t
*
data_im_ptr
=
data_im
+
im_ptr_offset
;
opmath_t
*
grad_im_ptr
=
grad_im
+
im_ptr_offset
;
const
opmath_t
p0_w_
=
p0_w
-
((
dilation_w
*
(
kernel_w
-
1
))
>>
1
)
*
offset_scale
;
const
opmath_t
p0_h_
=
p0_h
-
((
dilation_h
*
(
kernel_h
-
1
))
>>
1
)
*
offset_scale
;
for
(
int
i
=
0
;
i
<
kernel_w
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_h
;
++
j
)
{
const
opmath_t
offset_w
=
data_offset
[
data_loc_w_ptr
];
const
opmath_t
offset_h
=
data_offset
[
data_loc_w_ptr
+
1
];
const
opmath_t
loc_w
=
p0_w_
+
(
i
*
dilation_w
+
offset_w
)
*
offset_scale
;
const
opmath_t
loc_h
=
p0_h_
+
(
j
*
dilation_h
+
offset_h
)
*
offset_scale
;
const
opmath_t
weight
=
data_mask
[
data_weight_ptr
];
if
(
loc_h
>
-
1
&&
loc_w
>
-
1
&&
loc_h
<
height_in
&&
loc_w
<
width_in
)
{
dcnv3_col2im_bilinear_gm
(
data_im_ptr
,
height_in
,
width_in
,
group
,
group_channels
,
loc_h
,
loc_w
,
g_col
,
c_col
,
offset_scale
,
top_grad
,
weight
,
grad_im_ptr
,
grad_offset
,
grad_mask
);
}
data_weight_ptr
+=
1
;
data_loc_w_ptr
+=
2
;
grad_mask
+=
1
;
grad_offset
+=
2
;
}
}
}
}
template
<
typename
scalar_t
>
void
dcnv3_im2col_cuda
(
cudaStream_t
stream
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
scalar_t
*
data_col
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
batch_n
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
)
{
const
int
num_kernels
=
batch_n
*
height_out
*
width_out
*
group
*
group_channels
;
const
int
num_actual_kernels
=
batch_n
*
height_out
*
width_out
*
group
*
group_channels
;
const
int
num_threads
=
CUDA_NUM_THREADS
;
dcnv3_im2col_gpu_kernel
<
scalar_t
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
data_im
,
data_offset
,
data_mask
,
data_col
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
);
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in dcnv3_im2col_cuda: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
template
<
typename
scalar_t
>
void
dcnv3_col2im_cuda
(
cudaStream_t
stream
,
const
scalar_t
*
grad_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
int
batch_n
,
const
int
height_in
,
const
int
width_in
,
const
int
height_out
,
const
int
width_out
,
const
opmath_t
offset_scale
,
opmath_t
*
grad_im
,
opmath_t
*
grad_offset
,
opmath_t
*
grad_mask
)
{
const
int
num_threads
=
(
group_channels
>
CUDA_NUM_THREADS
)
?
CUDA_NUM_THREADS
:
group_channels
;
const
int
num_kernels
=
batch_n
*
height_out
*
width_out
*
group
*
group_channels
;
const
int
num_actual_kernels
=
batch_n
*
height_out
*
width_out
*
group
*
group_channels
;
if
(
group_channels
>
1024
)
{
if
((
group_channels
&
1023
)
==
0
)
{
dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks
<
scalar_t
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
num_threads
*
3
*
sizeof
(
opmath_t
),
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
}
else
{
dcnv3_col2im_gpu_kernel_gm
<
scalar_t
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
}
}
else
{
switch
(
group_channels
)
{
case
1
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<
scalar_t
,
1
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
2
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<
scalar_t
,
2
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
4
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<
scalar_t
,
4
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
8
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<
scalar_t
,
8
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
16
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<
scalar_t
,
16
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
32
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1
<
scalar_t
,
32
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
64
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<
scalar_t
,
64
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
128
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<
scalar_t
,
128
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
256
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<
scalar_t
,
256
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
512
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<
scalar_t
,
512
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
case
1024
:
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2
<
scalar_t
,
1024
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
0
,
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
break
;
default:
if
(
group_channels
<
64
)
{
dcnv3_col2im_gpu_kernel_shm_reduce_v1
<
scalar_t
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
num_threads
*
3
*
sizeof
(
opmath_t
),
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
}
else
{
dcnv3_col2im_gpu_kernel_shm_reduce_v2
<
scalar_t
>
<<<
GET_BLOCKS
(
num_actual_kernels
,
num_threads
),
num_threads
,
num_threads
*
3
*
sizeof
(
opmath_t
),
stream
>>>
(
num_kernels
,
grad_col
,
data_im
,
data_offset
,
data_mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
height_in
,
width_in
,
height_out
,
width_out
,
offset_scale
,
grad_im
,
grad_offset
,
grad_mask
);
}
}
}
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in dcnv3_col2im_cuda: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
\ No newline at end of file
segmentation/ops_dcnv3/src/dcnv3.h
0 → 100644
View file @
c4552f79
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/dcnv3_cpu.h"
#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif
at
::
Tensor
dcnv3_forward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
int
im2col_step
)
{
if
(
input
.
type
().
is_cuda
())
{
#ifdef WITH_CUDA
return
dcnv3_cuda_forward
(
input
,
offset
,
mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
offset_scale
,
im2col_step
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
AT_ERROR
(
"Not implemented on the CPU"
);
}
std
::
vector
<
at
::
Tensor
>
dcnv3_backward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
offset
,
const
at
::
Tensor
&
mask
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
group_channels
,
const
float
offset_scale
,
const
at
::
Tensor
&
grad_output
,
const
int
im2col_step
)
{
if
(
input
.
type
().
is_cuda
())
{
#ifdef WITH_CUDA
return
dcnv3_cuda_backward
(
input
,
offset
,
mask
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
group_channels
,
offset_scale
,
grad_output
,
im2col_step
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
AT_ERROR
(
"Not implemented on the CPU"
);
}
segmentation/ops_dcnv3/src/vision.cpp
0 → 100644
View file @
c4552f79
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "dcnv3.h"
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
m
.
def
(
"dcnv3_forward"
,
&
dcnv3_forward
,
"dcnv3_forward"
);
m
.
def
(
"dcnv3_backward"
,
&
dcnv3_backward
,
"dcnv3_backward"
);
}
segmentation/ops_dcnv3/test.py
0 → 100644
View file @
c4552f79
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
__future__
import
absolute_import
from
__future__
import
print_function
from
__future__
import
division
import
time
import
torch
import
torch.nn
as
nn
import
math
from
torch.autograd
import
gradcheck
from
functions.dcnv3_func
import
DCNv3Function
,
dcnv3_core_pytorch
H_in
,
W_in
=
8
,
8
N
,
M
,
D
=
2
,
4
,
16
Kh
,
Kw
=
3
,
3
P
=
Kh
*
Kw
offset_scale
=
2.0
pad
=
1
dilation
=
1
stride
=
1
H_out
=
(
H_in
+
2
*
pad
-
(
dilation
*
(
Kh
-
1
)
+
1
))
//
stride
+
1
W_out
=
(
W_in
+
2
*
pad
-
(
dilation
*
(
Kw
-
1
)
+
1
))
//
stride
+
1
torch
.
manual_seed
(
3
)
@
torch
.
no_grad
()
def
check_forward_equal_with_pytorch_double
():
input
=
torch
.
rand
(
N
,
H_in
,
W_in
,
M
*
D
).
cuda
()
*
0.01
offset
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
*
P
*
2
).
cuda
()
*
10
mask
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
,
P
).
cuda
()
+
1e-5
mask
/=
mask
.
sum
(
-
1
,
keepdim
=
True
)
mask
=
mask
.
reshape
(
N
,
H_out
,
W_out
,
M
*
P
)
output_pytorch
=
dcnv3_core_pytorch
(
input
.
double
(),
offset
.
double
(),
mask
.
double
(),
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
).
detach
().
cpu
()
im2col_step
=
2
output_cuda
=
DCNv3Function
.
apply
(
input
.
double
(),
offset
.
double
(),
mask
.
double
(),
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
,
im2col_step
).
detach
().
cpu
()
fwdok
=
torch
.
allclose
(
output_cuda
,
output_pytorch
)
max_abs_err
=
(
output_cuda
-
output_pytorch
).
abs
().
max
()
max_rel_err
=
((
output_cuda
-
output_pytorch
).
abs
()
/
output_pytorch
.
abs
()).
max
()
print
(
'>>> forward double'
)
print
(
f
'*
{
fwdok
}
check_forward_equal_with_pytorch_double: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
@
torch
.
no_grad
()
def
check_forward_equal_with_pytorch_float
():
input
=
torch
.
rand
(
N
,
H_in
,
W_in
,
M
*
D
).
cuda
()
*
0.01
offset
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
*
P
*
2
).
cuda
()
*
10
mask
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
,
P
).
cuda
()
+
1e-5
mask
/=
mask
.
sum
(
-
1
,
keepdim
=
True
)
mask
=
mask
.
reshape
(
N
,
H_out
,
W_out
,
M
*
P
)
output_pytorch
=
dcnv3_core_pytorch
(
input
,
offset
,
mask
,
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
).
detach
().
cpu
()
im2col_step
=
2
output_cuda
=
DCNv3Function
.
apply
(
input
,
offset
,
mask
,
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
,
im2col_step
).
detach
().
cpu
()
fwdok
=
torch
.
allclose
(
output_cuda
,
output_pytorch
,
rtol
=
1e-2
,
atol
=
1e-3
)
max_abs_err
=
(
output_cuda
-
output_pytorch
).
abs
().
max
()
max_rel_err
=
((
output_cuda
-
output_pytorch
).
abs
()
/
output_pytorch
.
abs
()).
max
()
print
(
'>>> forward float'
)
print
(
f
'*
{
fwdok
}
check_forward_equal_with_pytorch_float: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
def
check_backward_equal_with_pytorch_double
(
channels
=
4
,
grad_input
=
True
,
grad_offset
=
True
,
grad_mask
=
True
):
# H_in, W_in = 4, 4
N
=
2
M
=
2
H_out
=
(
H_in
+
2
*
pad
-
(
dilation
*
(
Kh
-
1
)
+
1
))
//
stride
+
1
W_out
=
(
W_in
+
2
*
pad
-
(
dilation
*
(
Kw
-
1
)
+
1
))
//
stride
+
1
D
=
channels
input0
=
torch
.
rand
(
N
,
H_in
,
W_in
,
M
*
D
).
cuda
()
*
0.01
offset0
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
*
P
*
2
).
cuda
()
*
10
mask0
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
,
P
).
cuda
()
+
1e-5
mask0
/=
mask0
.
sum
(
-
1
,
keepdim
=
True
)
mask0
=
mask0
.
reshape
(
N
,
H_out
,
W_out
,
M
*
P
)
input0
.
requires_grad
=
grad_input
offset0
.
requires_grad
=
grad_offset
mask0
.
requires_grad
=
grad_mask
output_pytorch
=
dcnv3_core_pytorch
(
input0
.
double
(),
offset0
.
double
(),
mask0
.
double
(),
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
)
output_pytorch
.
sum
().
backward
()
input1
=
input0
.
detach
()
offset1
=
offset0
.
detach
()
mask1
=
mask0
.
detach
()
input1
.
requires_grad
=
grad_input
offset1
.
requires_grad
=
grad_offset
mask1
.
requires_grad
=
grad_mask
im2col_step
=
2
output_cuda
=
DCNv3Function
.
apply
(
input1
.
double
(),
offset1
.
double
(),
mask1
.
double
(),
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
,
im2col_step
)
output_cuda
.
sum
().
backward
()
print
(
f
'>>> backward double: channels
{
D
}
'
)
bwdok
=
torch
.
allclose
(
input0
.
grad
,
input1
.
grad
,
rtol
=
1e-2
,
atol
=
1e-3
)
max_abs_err
=
(
input0
.
grad
-
input1
.
grad
).
abs
().
max
()
max_rel_err
=
((
input0
.
grad
-
input1
.
grad
).
abs
()
/
input0
.
grad
.
abs
()).
max
()
print
(
f
'*
{
bwdok
}
input_grad check_backward_equal_with_pytorch_double: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
bwdok
=
torch
.
allclose
(
offset0
.
grad
,
offset1
.
grad
,
rtol
=
1e-2
,
atol
=
1e-3
)
max_abs_err
=
(
offset0
.
grad
-
offset1
.
grad
).
abs
().
max
()
max_rel_err
=
((
offset0
.
grad
-
offset1
.
grad
).
abs
()
/
offset0
.
grad
.
abs
()).
max
()
print
(
f
'*
{
bwdok
}
offset_grad check_backward_equal_with_pytorch_double: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
bwdok
=
torch
.
allclose
(
mask0
.
grad
,
mask1
.
grad
,
rtol
=
1e-2
,
atol
=
1e-3
)
max_abs_err
=
(
mask0
.
grad
-
mask1
.
grad
).
abs
().
max
()
max_rel_err
=
((
mask0
.
grad
-
mask1
.
grad
).
abs
()
/
mask0
.
grad
.
abs
()).
max
()
print
(
f
'*
{
bwdok
}
mask_grad check_backward_equal_with_pytorch_double: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
def
check_backward_equal_with_pytorch_float
(
channels
=
4
,
grad_input
=
True
,
grad_offset
=
True
,
grad_mask
=
True
):
# H_in, W_in = 4, 4
N
=
2
M
=
2
H_out
=
(
H_in
+
2
*
pad
-
(
dilation
*
(
Kh
-
1
)
+
1
))
//
stride
+
1
W_out
=
(
W_in
+
2
*
pad
-
(
dilation
*
(
Kw
-
1
)
+
1
))
//
stride
+
1
D
=
channels
input0
=
torch
.
rand
(
N
,
H_in
,
W_in
,
M
*
D
).
cuda
()
*
0.01
offset0
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
*
P
*
2
).
cuda
()
*
10
mask0
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
,
P
).
cuda
()
+
1e-5
mask0
/=
mask0
.
sum
(
-
1
,
keepdim
=
True
)
mask0
=
mask0
.
reshape
(
N
,
H_out
,
W_out
,
M
*
P
)
input0
.
requires_grad
=
grad_input
offset0
.
requires_grad
=
grad_offset
mask0
.
requires_grad
=
grad_mask
output_pytorch
=
dcnv3_core_pytorch
(
input0
,
offset0
,
mask0
,
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
)
output_pytorch
.
sum
().
backward
()
input1
=
input0
.
detach
()
offset1
=
offset0
.
detach
()
mask1
=
mask0
.
detach
()
input1
.
requires_grad
=
grad_input
offset1
.
requires_grad
=
grad_offset
mask1
.
requires_grad
=
grad_mask
im2col_step
=
2
output_cuda
=
DCNv3Function
.
apply
(
input1
,
offset1
,
mask1
,
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
offset_scale
,
im2col_step
)
output_cuda
.
sum
().
backward
()
print
(
f
'>>> backward float: channels
{
D
}
'
)
bwdok
=
torch
.
allclose
(
input0
.
grad
,
input1
.
grad
,
rtol
=
1e-2
,
atol
=
1e-3
)
max_abs_err
=
(
input0
.
grad
-
input1
.
grad
).
abs
().
max
()
max_rel_err
=
((
input0
.
grad
-
input1
.
grad
).
abs
()
/
input0
.
grad
.
abs
()).
max
()
print
(
f
'*
{
bwdok
}
input_grad check_backward_equal_with_pytorch_float: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
bwdok
=
torch
.
allclose
(
offset0
.
grad
,
offset1
.
grad
,
rtol
=
1e-2
,
atol
=
1e-3
)
max_abs_err
=
(
offset0
.
grad
-
offset1
.
grad
).
abs
().
max
()
max_rel_err
=
((
offset0
.
grad
-
offset1
.
grad
).
abs
()
/
offset0
.
grad
.
abs
()).
max
()
print
(
f
'*
{
bwdok
}
offset_grad check_backward_equal_with_pytorch_float: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
bwdok
=
torch
.
allclose
(
mask0
.
grad
,
mask1
.
grad
,
rtol
=
1e-2
,
atol
=
1e-3
)
max_abs_err
=
(
mask0
.
grad
-
mask1
.
grad
).
abs
().
max
()
max_rel_err
=
((
mask0
.
grad
-
mask1
.
grad
).
abs
()
/
mask0
.
grad
.
abs
()).
max
()
print
(
f
'*
{
bwdok
}
mask_grad check_backward_equal_with_pytorch_float: max_abs_err
{
max_abs_err
:.
2
e
}
max_rel_err
{
max_rel_err
:.
2
e
}
'
)
@
torch
.
no_grad
()
def
check_time_cost
(
im2col_step
=
128
):
N
=
512
H_in
,
W_in
=
64
,
64
H_out
=
(
H_in
+
2
*
pad
-
(
dilation
*
(
Kh
-
1
)
+
1
))
//
stride
+
1
W_out
=
(
W_in
+
2
*
pad
-
(
dilation
*
(
Kw
-
1
)
+
1
))
//
stride
+
1
input
=
torch
.
rand
(
N
,
H_in
,
W_in
,
M
*
D
).
cuda
()
*
0.01
offset
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
*
P
*
2
).
cuda
()
*
10
mask
=
torch
.
rand
(
N
,
H_out
,
W_out
,
M
,
P
).
cuda
()
+
1e-5
mask
/=
mask
.
sum
(
-
1
,
keepdim
=
True
)
mask
=
mask
.
reshape
(
N
,
H_out
,
W_out
,
M
*
P
)
print
(
f
'>>> time cost: im2col_step
{
im2col_step
}
; input
{
input
.
shape
}
; points
{
P
}
'
)
repeat
=
100
for
i
in
range
(
repeat
):
output_cuda
=
DCNv3Function
.
apply
(
input
,
offset
,
mask
,
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
1.0
,
im2col_step
)
torch
.
cuda
.
synchronize
()
start
=
time
.
time
()
for
i
in
range
(
repeat
):
output_cuda
=
DCNv3Function
.
apply
(
input
,
offset
,
mask
,
Kh
,
Kw
,
stride
,
stride
,
Kh
//
2
,
Kw
//
2
,
dilation
,
dilation
,
M
,
D
,
1.0
,
im2col_step
)
torch
.
cuda
.
synchronize
()
print
(
f
'foward time cost:
{
(
time
.
time
()
-
start
)
/
repeat
}
'
)
if
__name__
==
'__main__'
:
check_forward_equal_with_pytorch_double
()
check_forward_equal_with_pytorch_float
()
for
channels
in
[
1
,
16
,
30
,
32
,
64
,
71
,
1025
]:
check_backward_equal_with_pytorch_double
(
channels
,
True
,
True
,
True
)
for
channels
in
[
1
,
16
,
30
,
32
,
64
,
71
,
1025
]:
check_backward_equal_with_pytorch_float
(
channels
,
True
,
True
,
True
)
for
i
in
range
(
3
):
im2col_step
=
128
*
(
2
**
i
)
check_time_cost
(
im2col_step
)
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment