OpenDAS / dcnv3 · Commit df3c64a9
Authored Apr 17, 2023 by zhiqi-li
support occupancy prediction
Parent: bdd98bcb

Showing 20 changed files with 3337 additions and 0 deletions (+3337, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/__init__.py (+7, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/__init__.py (+3, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py (+200, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/test.py (+207, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/train.py (+67, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/__init__.py (+2, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/custom_layer_decay_optimizer_constructor.py (+142, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/internimage.py (+702, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/__init__.py (+7, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py (+188, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/make.sh (+8, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/__init__.py (+7, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/dcnv3.py (+345, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/setup.py (+75, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.cpp (+37, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.h (+31, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu (+174, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.h (+31, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh (+1045, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/dcnv3.h (+59, -0)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/__init__.py (new file, mode 100644)

from .dense_heads import *
from .detectors import *
from .modules import *
from .runner import *
from .hooks import *
from .backbones import *
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/__init__.py (new file, mode 100644)

from .train import custom_train_model
from .mmdet_train import custom_train_detector
# from .test import custom_multi_gpu_test
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py (new file, mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings

import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_optimizer,
                         build_runner, get_dist_info)
from mmcv.utils import build_from_cfg

from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset, replace_ImageToTensor)
from mmdet.utils import get_root_logger
import time
import os.path as osp
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
from projects.mmdet3d_plugin.datasets import custom_build_dataset


def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # assert len(dataset) == 1
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        ) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        if eval_model is not None:
            eval_model = MMDistributedDataParallel(
                eval_model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
        if eval_model is not None:
            eval_model = MMDataParallel(
                eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs

    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # register profiler hook
    # trace_config = dict(type='tb_trace', dir_name='work_dir')
    # profiler_config = dict(on_trace_ready=trace_config)
    # runner.register_profiler_hook(profiler_config)

    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            assert False
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))

        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_cfg['jsonfile_prefix'] = osp.join(
            'val', cfg.work_dir,
            time.ctime().replace(' ', '_').replace(':', '_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
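Note that custom_train_detector reads two non-standard keys from cfg.data (shuffler_sampler, nonshuffler_sampler) in addition to the usual MMDetection fields. A minimal sketch of the matching config fragment, with illustrative values only (the sampler types come from the inline comments above, everything else is an assumption):

# Hypothetical config fragment consumed by custom_train_detector.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    # custom sampler hooks forwarded to the plugin's build_dataloader
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
)
optimizer = dict(type='AdamW', lr=2e-4, weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
runner = dict(type='EpochBasedRunner', max_epochs=24)
evaluation = dict(interval=1)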
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/test.py (new file, mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Xiaoyu Tian
# ---------------------------------------------
import os.path as osp
import pickle
import shutil
import tempfile
import time

import mmcv
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info

from mmdet.core import encode_mask_results

import numpy as np
import pycocotools.mask as mask_util


def custom_encode_mask_results(mask_results):
    """Encode bitmap mask to RLE code. Semantic Masks only
    Args:
        mask_results (list | tuple[list]): bitmap mask results.
            In mask scoring rcnn, mask_results is a tuple of (segm_results,
            segm_cls_score).
    Returns:
        list | tuple: RLE encoded mask.
    """
    cls_segms = mask_results
    num_classes = len(cls_segms)
    encoded_mask_results = []
    for i in range(len(cls_segms)):
        encoded_mask_results.append(
            mask_util.encode(
                np.array(
                    cls_segms[i][:, :, np.newaxis], order='F',
                    dtype='uint8'))[0])  # encoded with RLE
    return [encoded_mask_results]


def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.
    This method tests model with multiple gpus and collects the results
    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
    it encodes results to gpu tensors and uses gpu communication for results
    collection. On cpu mode it saves the results on different gpus to 'tmpdir'
    and collects them by the rank 0 worker.
    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.
    Returns:
        list: The prediction results.
    """
    model.eval()
    bbox_results = []
    mask_results = []
    occ_results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.
    have_mask = False
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            bs = result.shape[0]
            assert bs == 1, \
                'Evaluation only supports batch_size=1 in this version'
            # encode mask results
            if isinstance(result, dict):
                if 'bbox_results' in result.keys():
                    bbox_result = result['bbox_results']
                    batch_size = len(result['bbox_results'])
                    bbox_results.extend(bbox_result)
                if 'mask_results' in result.keys() and \
                        result['mask_results'] is not None:
                    mask_result = custom_encode_mask_results(
                        result['mask_results'])
                    mask_results.extend(mask_result)
                    have_mask = True
            else:
                batch_size = 1
                occ_results.extend(
                    [result.squeeze(dim=0).cpu().numpy().astype(np.uint8)])
                # batch_size = len(result)
                # bbox_results.extend(result)

            # if isinstance(result[0], tuple):
            #     assert False, 'this code is for instance segmentation, which our code will not utilize.'
            #     result = [(bbox_results, encode_mask_results(mask_results))
            #               for bbox_results, mask_results in result]
        if rank == 0:
            for _ in range(batch_size * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        bbox_results = collect_results_gpu(bbox_results, len(dataset))
        if have_mask:
            mask_results = collect_results_gpu(mask_results, len(dataset))
        else:
            mask_results = None
    else:
        # bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
        # tmpdir = tmpdir + '_mask' if tmpdir is not None else None
        # if have_mask:
        #     mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
        # else:
        #     mask_results = None
        tmpdir = tmpdir + '_occ' if tmpdir is not None else None
        occ_results = collect_results_cpu(occ_results, len(dataset), tmpdir)
    return occ_results


def collect_results_cpu(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            tmpdir = tempfile.mkdtemp(dir='.dist_test')
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f'part_{i}.pkl')
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        # because the sampler of the evaluation stage is changed so that each
        # gpu handles a continuous chunk of samples, the parts are simply
        # concatenated in rank order instead of interleaved:
        # for res in zip(*part_list):
        for res in part_list:
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results


def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3):
    """Test model with single gpu.
    This method tests model with single gpu and gives the 'show' option.
    By setting ``show=True``, it saves the visualization results under
    ``out_dir``.
    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        show (bool): Whether to save visualization results.
            Default: False.
        out_dir (str): The path to save visualization results.
            Default: None.
    Returns:
        list[dict]: The prediction results.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            results.extend(
                [result.squeeze(dim=0).cpu().numpy().astype(np.uint8)])
        batch_size = len(result)
        for _ in range(batch_size):
            prog_bar.update()
    return results


def collect_results_gpu(result_part, size):
    # despite the name, this falls back to CPU collection via a temp dir
    return collect_results_cpu(result_part, size)
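The ordering step in collect_results_cpu is worth spelling out: because each rank evaluates a contiguous chunk of the dataset, the rank-0 worker concatenates the per-rank parts in order rather than interleaving them (the zip(*part_list) line is deliberately commented out). A tiny self-contained sketch of that logic, with made-up per-rank results:

# Sketch of the ordering logic in collect_results_cpu, assuming each rank
# evaluated a contiguous chunk (rank 0 -> samples 0..2, rank 1 -> 3..5, ...).
part_list = [[0, 1, 2], [3, 4, 5], [6, 7]]  # per-rank results, illustrative
ordered = []
for res in part_list:       # concatenate rank by rank...
    ordered.extend(list(res))
# ...instead of interleaving with zip(*part_list), which would match the
# default round-robin assignment of a standard DistributedSampler.
print(ordered)  # [0, 1, 2, 3, 4, 5, 6, 7]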
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/train.py (new file, mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .mmdet_train import custom_train_detector
from mmseg.apis import train_segmentor
from mmdet.apis import train_detector


def custom_train_model(model,
                       dataset,
                       cfg,
                       distributed=False,
                       validate=False,
                       timestamp=None,
                       eval_model=None,
                       meta=None):
    """A function wrapper for launching model training according to cfg.
    Needed because we use a different eval_hook in the runner. Should be
    deprecated in the future.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        assert False
    else:
        custom_train_detector(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            eval_model=eval_model,
            meta=meta)


def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """A function wrapper for launching model training according to cfg.
    Needed because we use a different eval_hook in the runner. Should be
    deprecated in the future.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        train_segmentor(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            meta=meta)
    else:
        train_detector(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            meta=meta)
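For context, a minimal sketch of how custom_train_model might be invoked from a training script. The config path is hypothetical and model/dataset construction is reduced to the plugin's own builders; treat this as an assumption-laden outline, not the repo's actual tools/train.py:

# Hypothetical launch sketch for custom_train_model.
from mmcv import Config
from mmdet3d.models import build_model
from projects.mmdet3d_plugin.bevformer.apis import custom_train_model
from projects.mmdet3d_plugin.datasets import custom_build_dataset

cfg = Config.fromfile('projects/configs/some_config.py')  # hypothetical path
model = build_model(cfg.model)
datasets = [custom_build_dataset(cfg.data.train)]
custom_train_model(model, datasets, cfg, distributed=True, validate=True)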
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/__init__.py (new file, mode 100644)

from .internimage import InternImage
from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/custom_layer_decay_optimizer_constructor.py (new file, mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
"""
Mostly copy-paste from BEiT library:
https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py
"""
import json

from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
from mmcv.runner import get_dist_info
from mmdet.utils import get_root_logger


def get_num_layer_for_swin(var_name, num_max_layer, depths):
    if var_name.startswith("backbone.patch_embed"):
        return 0
    elif "level_embeds" in var_name:
        return 0
    elif var_name.startswith("backbone.layers") or \
            var_name.startswith("backbone.levels"):
        if var_name.split('.')[3] not in ['downsample', 'norm']:
            stage_id = int(var_name.split('.')[2])
            layer_id = int(var_name.split('.')[4])
            # layers for Swin-Large: [2, 2, 18, 2]
            if stage_id == 0:
                return layer_id + 1
            elif stage_id == 1:
                return layer_id + 1 + depths[0]
            elif stage_id == 2:
                return layer_id + 1 + depths[0] + depths[1]
            else:
                return layer_id + 1 + depths[0] + depths[1] + depths[2]
        else:
            stage_id = int(var_name.split('.')[2])
            if stage_id == 0:
                return 1 + depths[0]
            elif stage_id == 1:
                return 1 + depths[0] + depths[1]
            elif stage_id == 2:
                return 1 + depths[0] + depths[1] + depths[2]
            else:
                return 1 + depths[0] + depths[1] + depths[2]
    else:
        return num_max_layer - 1


@OPTIMIZER_BUILDERS.register_module()
class CustomLayerDecayOptimizerConstructor(DefaultOptimizerConstructor):

    def add_params(self, params, module, prefix='', is_dcn_module=None):
        """Add all parameters of module to the params list.
        The parameters of the given module will be added to the list of param
        groups, with specific rules defined by paramwise_cfg.
        Args:
            params (list[dict]): A list of param groups, it will be modified
                in place.
            module (nn.Module): The module to be added.
            prefix (str): The prefix of the module
            is_dcn_module (int|float|None): If the current module is a
                submodule of DCN, `is_dcn_module` will be passed to
                control conv_offset layer's learning rate. Defaults to None.
        """
        parameter_groups = {}
        logger = get_root_logger()
        logger.info(self.paramwise_cfg)
        backbone_small_lr = self.paramwise_cfg.get('backbone_small_lr', False)
        dino_head = self.paramwise_cfg.get('dino_head', False)
        num_layers = self.paramwise_cfg.get('num_layers') + 2
        layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
        depths = self.paramwise_cfg.get('depths')
        offset_lr_scale = self.paramwise_cfg.get('offset_lr_scale', 1.0)

        logger.info("Build CustomLayerDecayOptimizerConstructor %f - %d" %
                    (layer_decay_rate, num_layers))
        weight_decay = self.base_wd

        for name, param in module.named_parameters():
            if not param.requires_grad:
                continue  # frozen weights
            if len(param.shape) == 1 or name.endswith(".bias") or \
                    "relative_position" in name or \
                    "norm" in name or \
                    "sampling_offsets" in name:
                group_name = "no_decay"
                this_weight_decay = 0.
            else:
                group_name = "decay"
                this_weight_decay = weight_decay

            layer_id = get_num_layer_for_swin(name, num_layers, depths)

            if layer_id == num_layers - 1 and dino_head and \
                    ("sampling_offsets" in name or "reference_points" in name):
                group_name = "layer_%d_%s_0.1x" % (layer_id, group_name)
            elif "sampling_offsets" in name or "reference_points" in name:
                group_name = "layer_%d_%s_offset_lr_scale" % (layer_id,
                                                              group_name)
            else:
                group_name = "layer_%d_%s" % (layer_id, group_name)

            if group_name not in parameter_groups:
                scale = layer_decay_rate ** (num_layers - layer_id - 1)
                if scale < 1 and backbone_small_lr == True:
                    scale = scale * 0.1
                if "0.1x" in group_name:
                    scale = scale * 0.1
                if "offset_lr_scale" in group_name:
                    scale = scale * offset_lr_scale

                parameter_groups[group_name] = {
                    "weight_decay": this_weight_decay,
                    "params": [],
                    "param_names": [],
                    "lr_scale": scale,
                    "group_name": group_name,
                    "lr": scale * self.base_lr,
                }

            parameter_groups[group_name]["params"].append(param)
            parameter_groups[group_name]["param_names"].append(name)

        rank, _ = get_dist_info()
        if rank == 0:
            to_display = {}
            for key in parameter_groups:
                to_display[key] = {
                    "param_names": parameter_groups[key]["param_names"],
                    "lr_scale": parameter_groups[key]["lr_scale"],
                    "lr": parameter_groups[key]["lr"],
                    "weight_decay": parameter_groups[key]["weight_decay"],
                }
            logger.info("Param groups = %s" % json.dumps(to_display, indent=2))

        # state_dict = module.state_dict()
        # for group_name in parameter_groups:
        #     group = parameter_groups[group_name]
        #     for name in group["param_names"]:
        #         group["params"].append(state_dict[name])
        params.extend(parameter_groups.values())
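To use the constructor registered above, a config names it via the optimizer's constructor field and supplies the paramwise_cfg keys that add_params reads (num_layers, layer_decay_rate, depths, and optionally offset_lr_scale, backbone_small_lr, dino_head). A hedged sketch with illustrative numbers (num_layers equal to sum(depths); add_params adds 2 internally for the stem and head groups):

# Hypothetical optimizer config for CustomLayerDecayOptimizerConstructor.
optimizer = dict(
    type='AdamW',
    lr=2e-4,
    weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=30,         # sum of depths; +2 is applied inside add_params
        layer_decay_rate=0.9,  # lr scale = rate ** (num_layers - layer_id - 1)
        depths=[4, 4, 18, 4],
        offset_lr_scale=1.0))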
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/internimage.py (new file, mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torch.nn as nn
from collections import OrderedDict
import torch.utils.checkpoint as checkpoint
from timm.models.layers import trunc_normal_, DropPath
from mmcv.runner import _load_checkpoint
from mmcv.cnn import constant_init, trunc_normal_init
from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES
import torch.nn.functional as F
from .ops_dcnv3 import modules as opsm


class to_channels_first(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.permute(0, 3, 1, 2)


class to_channels_last(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.permute(0, 2, 3, 1)


def build_norm_layer(dim,
                     norm_layer,
                     in_format='channels_last',
                     out_format='channels_last',
                     eps=1e-6):
    layers = []
    if norm_layer == 'BN':
        if in_format == 'channels_last':
            layers.append(to_channels_first())
        layers.append(nn.BatchNorm2d(dim))
        if out_format == 'channels_last':
            layers.append(to_channels_last())
    elif norm_layer == 'LN':
        if in_format == 'channels_first':
            layers.append(to_channels_last())
        layers.append(nn.LayerNorm(dim, eps=eps))
        if out_format == 'channels_first':
            layers.append(to_channels_first())
    else:
        raise NotImplementedError(
            f'build_norm_layer does not support {norm_layer}')
    return nn.Sequential(*layers)


def build_act_layer(act_layer):
    if act_layer == 'ReLU':
        return nn.ReLU(inplace=True)
    elif act_layer == 'SiLU':
        return nn.SiLU(inplace=True)
    elif act_layer == 'GELU':
        return nn.GELU()
    raise NotImplementedError(f'build_act_layer does not support {act_layer}')


class CrossAttention(nn.Module):
    r""" Cross Attention Module
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads. Default: 8
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: False.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        attn_drop (float, optional): Dropout ratio of attention weight.
            Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        attn_head_dim (int, optional): Dimension of attention head.
        out_dim (int, optional): Dimension of output.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 attn_head_dim=None,
                 out_dim=None):
        super().__init__()
        if out_dim is None:
            out_dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5
        assert all_head_dim == dim

        self.q = nn.Linear(dim, all_head_dim, bias=False)
        self.k = nn.Linear(dim, all_head_dim, bias=False)
        self.v = nn.Linear(dim, all_head_dim, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.k_bias = None
            self.v_bias = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, out_dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, k=None, v=None):
        B, N, C = x.shape
        N_k = k.shape[1]
        N_v = v.shape[1]

        q_bias, k_bias, v_bias = None, None, None
        if self.q_bias is not None:
            q_bias = self.q_bias
            k_bias = self.k_bias
            v_bias = self.v_bias

        q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
        q = q.reshape(B, N, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1, 4).squeeze(0)  # (B, N_head, N_q, dim)

        k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
        k = k.reshape(B, N_k, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1, 4).squeeze(0)

        v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
        v = v.reshape(B, N_v, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1, 4).squeeze(0)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))  # (B, N_head, N_q, N_k)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class AttentiveBlock(nn.Module):
    r"""Attentive Block
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads. Default: 8
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: False.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop (float, optional): Dropout rate. Default: 0.0.
        attn_drop (float, optional): Attention dropout rate. Default: 0.0.
        drop_path (float | tuple[float], optional): Stochastic depth rate.
            Default: 0.0.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm.
        attn_head_dim (int, optional): Dimension of attention head. Default: None.
        out_dim (int, optional): Dimension of output. Default: None.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer="LN",
                 attn_head_dim=None,
                 out_dim=None):
        super().__init__()

        self.norm1_q = build_norm_layer(dim, norm_layer, eps=1e-6)
        self.norm1_k = build_norm_layer(dim, norm_layer, eps=1e-6)
        self.norm1_v = build_norm_layer(dim, norm_layer, eps=1e-6)
        self.cross_dcn = CrossAttention(dim,
                                        num_heads=num_heads,
                                        qkv_bias=qkv_bias,
                                        qk_scale=qk_scale,
                                        attn_drop=attn_drop,
                                        proj_drop=drop,
                                        attn_head_dim=attn_head_dim,
                                        out_dim=out_dim)

        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos,
                rel_pos_bias=None):
        x_q = self.norm1_q(x_q + pos_q)
        x_k = self.norm1_k(x_kv + pos_k)
        x_v = self.norm1_v(x_kv)

        x = self.cross_dcn(x_q, k=x_k, v=x_v)

        return x


class AttentionPoolingBlock(AttentiveBlock):

    def forward(self, x):
        x_q = x.mean(1, keepdim=True)
        x_kv = x
        pos_q, pos_k = 0, 0
        x = super().forward(x_q, x_kv, pos_q, pos_k,
                            bool_masked_pos=None,
                            rel_pos_bias=None)
        x = x.squeeze(1)
        return x


class StemLayer(nn.Module):
    r""" Stem layer of InternImage
    Args:
        in_chans (int): number of input channels
        out_chans (int): number of output channels
        act_layer (str): activation layer
        norm_layer (str): normalization layer
    """

    def __init__(self,
                 in_chans=3,
                 out_chans=96,
                 act_layer='GELU',
                 norm_layer='BN'):
        super().__init__()
        self.conv1 = nn.Conv2d(in_chans,
                               out_chans // 2,
                               kernel_size=3,
                               stride=2,
                               padding=1)
        self.norm1 = build_norm_layer(out_chans // 2, norm_layer,
                                      'channels_first', 'channels_first')
        self.act = build_act_layer(act_layer)
        self.conv2 = nn.Conv2d(out_chans // 2,
                               out_chans,
                               kernel_size=3,
                               stride=2,
                               padding=1)
        self.norm2 = build_norm_layer(out_chans, norm_layer,
                                      'channels_first', 'channels_last')

    def forward(self, x):
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.act(x)
        x = self.conv2(x)
        x = self.norm2(x)
        return x


class DownsampleLayer(nn.Module):
    r""" Downsample layer of InternImage
    Args:
        channels (int): number of input channels
        norm_layer (str): normalization layer
    """

    def __init__(self, channels, norm_layer='LN'):
        super().__init__()
        self.conv = nn.Conv2d(channels,
                              2 * channels,
                              kernel_size=3,
                              stride=2,
                              padding=1,
                              bias=False)
        self.norm = build_norm_layer(2 * channels, norm_layer,
                                     'channels_first', 'channels_last')

    def forward(self, x):
        x = self.conv(x.permute(0, 3, 1, 2))
        x = self.norm(x)
        return x


class MLPLayer(nn.Module):
    r""" MLP layer of InternImage
    Args:
        in_features (int): number of input features
        hidden_features (int): number of hidden features
        out_features (int): number of output features
        act_layer (str): activation layer
        drop (float): dropout rate
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer='GELU',
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = build_act_layer(act_layer)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class InternImageLayer(nn.Module):
    r""" Basic layer of InternImage
    Args:
        core_op (nn.Module): core operation of InternImage
        channels (int): number of input channels
        groups (list): Groups of each block.
        mlp_ratio (float): ratio of mlp hidden features to input channels
        drop (float): dropout rate
        drop_path (float): drop path rate
        act_layer (str): activation layer
        norm_layer (str): normalization layer
        post_norm (bool): whether to use post normalization
        layer_scale (float): layer scale
        offset_scale (float): offset scale
        with_cp (bool): whether to use checkpoint
    """

    def __init__(self,
                 core_op,
                 channels,
                 groups,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer='GELU',
                 norm_layer='LN',
                 post_norm=False,
                 layer_scale=None,
                 offset_scale=1.0,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False):  # for InternImage-H/G
        super().__init__()
        self.channels = channels
        self.groups = groups
        self.mlp_ratio = mlp_ratio
        self.with_cp = with_cp

        self.norm1 = build_norm_layer(channels, 'LN')
        self.post_norm = post_norm
        self.dcn = core_op(
            channels=channels,
            kernel_size=3,
            stride=1,
            pad=1,
            dilation=1,
            group=groups,
            offset_scale=offset_scale,
            act_layer=act_layer,
            norm_layer=norm_layer,
            dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
            center_feature_scale=center_feature_scale)  # for InternImage-H/G
        self.drop_path = DropPath(drop_path) if drop_path > 0. \
            else nn.Identity()
        self.norm2 = build_norm_layer(channels, 'LN')
        self.mlp = MLPLayer(in_features=channels,
                            hidden_features=int(channels * mlp_ratio),
                            act_layer=act_layer,
                            drop=drop)
        self.layer_scale = layer_scale is not None
        if self.layer_scale:
            self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
                                       requires_grad=True)
            self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
                                       requires_grad=True)
        self.res_post_norm = res_post_norm
        if res_post_norm:
            self.res_post_norm1 = build_norm_layer(channels, 'LN')
            self.res_post_norm2 = build_norm_layer(channels, 'LN')

    def forward(self, x):

        def _inner_forward(x):
            if not self.layer_scale:
                if self.post_norm:
                    x = x + self.drop_path(self.norm1(self.dcn(x)))
                    x = x + self.drop_path(self.norm2(self.mlp(x)))
                elif self.res_post_norm:  # for InternImage-H/G
                    x = x + self.drop_path(
                        self.res_post_norm1(self.dcn(self.norm1(x))))
                    x = x + self.drop_path(
                        self.res_post_norm2(self.mlp(self.norm2(x))))
                else:
                    x = x + self.drop_path(self.dcn(self.norm1(x)))
                    x = x + self.drop_path(self.mlp(self.norm2(x)))
                return x
            if self.post_norm:
                x = x + self.drop_path(self.gamma1 * self.norm1(self.dcn(x)))
                x = x + self.drop_path(self.gamma2 * self.norm2(self.mlp(x)))
            else:
                x = x + self.drop_path(self.gamma1 * self.dcn(self.norm1(x)))
                x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
            return x

        if self.with_cp and x.requires_grad:
            x = checkpoint.checkpoint(_inner_forward, x)
        else:
            x = _inner_forward(x)
        return x


class InternImageBlock(nn.Module):
    r""" Block of InternImage
    Args:
        core_op (nn.Module): core operation of InternImage
        channels (int): number of input channels
        depths (list): Depth of each block.
        groups (list): Groups of each block.
        mlp_ratio (float): ratio of mlp hidden features to input channels
        drop (float): dropout rate
        drop_path (float): drop path rate
        act_layer (str): activation layer
        norm_layer (str): normalization layer
        post_norm (bool): whether to use post normalization
        layer_scale (float): layer scale
        offset_scale (float): offset scale
        with_cp (bool): whether to use checkpoint
    """

    def __init__(self,
                 core_op,
                 channels,
                 depth,
                 groups,
                 downsample=True,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer='GELU',
                 norm_layer='LN',
                 post_norm=False,
                 offset_scale=1.0,
                 layer_scale=None,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False):  # for InternImage-H/G
        super().__init__()
        self.channels = channels
        self.depth = depth
        self.post_norm = post_norm
        self.center_feature_scale = center_feature_scale

        self.blocks = nn.ModuleList([
            InternImageLayer(
                core_op=core_op,
                channels=channels,
                groups=groups,
                mlp_ratio=mlp_ratio,
                drop=drop,
                drop_path=drop_path[i] if isinstance(
                    drop_path, list) else drop_path,
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            ) for i in range(depth)
        ])
        if not self.post_norm or center_feature_scale:
            self.norm = build_norm_layer(channels, 'LN')
        self.post_norm_block_ids = post_norm_block_ids
        if post_norm_block_ids is not None:  # for InternImage-H/G
            self.post_norms = nn.ModuleList(
                [build_norm_layer(channels, 'LN', eps=1e-6)
                 for _ in post_norm_block_ids])
        self.downsample = DownsampleLayer(
            channels=channels, norm_layer=norm_layer) if downsample else None

    def forward(self, x, return_wo_downsample=False):
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if (self.post_norm_block_ids is not None) and \
                    (i in self.post_norm_block_ids):
                index = self.post_norm_block_ids.index(i)
                x = self.post_norms[index](x)  # for InternImage-H/G
        if not self.post_norm or self.center_feature_scale:
            x = self.norm(x)
        if return_wo_downsample:
            x_ = x
        if self.downsample is not None:
            x = self.downsample(x)

        if return_wo_downsample:
            return x, x_
        return x


@BACKBONES.register_module()
class InternImage(nn.Module):
    r""" InternImage
    A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions` -
        https://arxiv.org/pdf/2103.14030
    Args:
        core_op (str): Core operator. Default: 'DCNv3'
        channels (int): Number of channels in the first stage. Default: 64
        depths (list): Depth of each block. Default: [3, 4, 18, 5]
        groups (list): Groups of each block. Default: [3, 6, 12, 24]
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Probability of an element to be zeroed. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        act_layer (str): Activation layer. Default: 'GELU'
        norm_layer (str): Normalization layer. Default: 'LN'
        layer_scale (bool): Whether to use layer scale. Default: False
        cls_scale (bool): Whether to use class scale. Default: False
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
        dw_kernel_size (int): Size of the dwconv. Default: None
        level2_post_norm (bool): Whether to use level2 post norm. Default: False
        level2_post_norm_block_ids (list): Indexes of post norm blocks. Default: None
        res_post_norm (bool): Whether to use res post norm. Default: False
        center_feature_scale (bool): Whether to use center feature scale. Default: False
    """

    def __init__(self,
                 core_op='DCNv3',
                 channels=64,
                 depths=[3, 4, 18, 5],
                 groups=[3, 6, 12, 24],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.2,
                 drop_path_type='linear',
                 act_layer='GELU',
                 norm_layer='LN',
                 layer_scale=None,
                 offset_scale=1.0,
                 post_norm=False,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 level2_post_norm=False,  # for InternImage-H/G
                 level2_post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False,  # for InternImage-H/G
                 out_indices=(0, 1, 2, 3),
                 init_cfg=None,
                 **kwargs):
        super().__init__()
        self.core_op = core_op
        self.num_levels = len(depths)
        self.depths = depths
        self.channels = channels
        self.num_features = int(channels * 2 ** (self.num_levels - 1))
        self.post_norm = post_norm
        self.mlp_ratio = mlp_ratio
        self.init_cfg = init_cfg
        self.out_indices = out_indices
        self.level2_post_norm_block_ids = level2_post_norm_block_ids
        logger = get_root_logger()
        logger.info(f'using core type: {core_op}')
        logger.info(f'using activation layer: {act_layer}')
        logger.info(f'using main norm layer: {norm_layer}')
        logger.info(f'using dpr: {drop_path_type}, {drop_path_rate}')
        logger.info(f"level2_post_norm: {level2_post_norm}")
        logger.info(f"level2_post_norm_block_ids: {level2_post_norm_block_ids}")
        logger.info(f"res_post_norm: {res_post_norm}")

        in_chans = 3
        self.patch_embed = StemLayer(in_chans=in_chans,
                                     out_chans=channels,
                                     act_layer=act_layer,
                                     norm_layer=norm_layer)
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]
        if drop_path_type == 'uniform':
            for i in range(len(dpr)):
                dpr[i] = drop_path_rate

        self.levels = nn.ModuleList()
        for i in range(self.num_levels):
            post_norm_block_ids = level2_post_norm_block_ids if level2_post_norm and (
                i == 2) else None  # for InternImage-H/G
            level = InternImageBlock(
                core_op=getattr(opsm, core_op),
                channels=int(channels * 2 ** i),
                depth=depths[i],
                groups=groups[i],
                mlp_ratio=self.mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                downsample=(i < self.num_levels - 1),
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                post_norm_block_ids=post_norm_block_ids,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            )
            self.levels.append(level)

        self.num_layers = len(depths)
        self.apply(self._init_weights)
        self.apply(self._init_deform_weights)

    def init_weights(self):
        logger = get_root_logger()
        if self.init_cfg is None:
            logger.warn(f'No pre-trained weights for '
                        f'{self.__class__.__name__}, '
                        f'training start from scratch')
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    trunc_normal_init(m, std=.02, bias=0.)
                elif isinstance(m, nn.LayerNorm):
                    constant_init(m, 1.0)
        else:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                                  f'specify `Pretrained` in ' \
                                                  f'`init_cfg` in ' \
                                                  f'{self.__class__.__name__}'
            ckpt = _load_checkpoint(self.init_cfg.checkpoint,
                                    logger=logger,
                                    map_location='cpu')
            if 'state_dict' in ckpt:
                _state_dict = ckpt['state_dict']
            elif 'model' in ckpt:
                _state_dict = ckpt['model']
            else:
                _state_dict = ckpt

            state_dict = OrderedDict()
            for k, v in _state_dict.items():
                if k.startswith('backbone.'):
                    state_dict[k[9:]] = v
                else:
                    state_dict[k] = v

            # strip prefix of state_dict
            if list(state_dict.keys())[0].startswith('module.'):
                state_dict = {k[7:]: v for k, v in state_dict.items()}

            # load state_dict
            meg = self.load_state_dict(state_dict, False)
            logger.info(meg)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _init_deform_weights(self, m):
        if isinstance(m, getattr(opsm, self.core_op)):
            m._reset_parameters()

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.pos_drop(x)

        seq_out = []
        for level_idx, level in enumerate(self.levels):
            x, x_ = level(x, return_wo_downsample=True)
            if level_idx in self.out_indices:
                seq_out.append(x_.permute(0, 3, 1, 2).contiguous())
        return seq_out
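A quick smoke test of the backbone above, assuming the DCNv3 CUDA extension has been built (the core_op string is looked up in ops_dcnv3.modules). The depths/groups values are illustrative; output shapes follow the stride-4 stem and the 2x downsampling between levels:

# Hedged usage sketch for the InternImage backbone; values are illustrative.
import torch
from projects.mmdet3d_plugin.bevformer.backbones import InternImage

backbone = InternImage(core_op='DCNv3',
                       channels=64,
                       depths=[4, 4, 18, 4],
                       groups=[4, 8, 16, 32],
                       out_indices=(1, 2, 3)).cuda()
feats = backbone(torch.randn(1, 3, 224, 224).cuda())
for f in feats:
    print(f.shape)  # (1, 128, 28, 28), (1, 256, 14, 14), (1, 512, 7, 7)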
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/__init__.py (new file, mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py (new file, mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd
import DCNv3


class DCNv3Function(Function):

    @staticmethod
    @custom_fwd
    def forward(ctx, input, offset, mask, kernel_h, kernel_w, stride_h,
                stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                group_channels, offset_scale, im2col_step):
        ctx.kernel_h = kernel_h
        ctx.kernel_w = kernel_w
        ctx.stride_h = stride_h
        ctx.stride_w = stride_w
        ctx.pad_h = pad_h
        ctx.pad_w = pad_w
        ctx.dilation_h = dilation_h
        ctx.dilation_w = dilation_w
        ctx.group = group
        ctx.group_channels = group_channels
        ctx.offset_scale = offset_scale
        ctx.im2col_step = im2col_step
        output = DCNv3.dcnv3_forward(
            input, offset, mask, kernel_h, kernel_w, stride_h, stride_w,
            pad_h, pad_w, dilation_h, dilation_w, group, group_channels,
            offset_scale, ctx.im2col_step)
        ctx.save_for_backward(input, offset, mask)

        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        input, offset, mask = ctx.saved_tensors
        grad_input, grad_offset, grad_mask = \
            DCNv3.dcnv3_backward(
                input, offset, mask, ctx.kernel_h, ctx.kernel_w,
                ctx.stride_h, ctx.stride_w, ctx.pad_h, ctx.pad_w,
                ctx.dilation_h, ctx.dilation_w, ctx.group,
                ctx.group_channels, ctx.offset_scale,
                grad_output.contiguous(), ctx.im2col_step)

        return grad_input, grad_offset, grad_mask, \
            None, None, None, None, None, None, None, None, None, None, None, None

    @staticmethod
    def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h,
                 stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                 group_channels, offset_scale, im2col_step):
        """Symbolic function for mmdeploy::DCNv3.
        Returns:
            DCNv3 op for onnx.
        """
        return g.op(
            'mmdeploy::TRTDCNv3',
            input,
            offset,
            mask,
            kernel_h_i=int(kernel_h),
            kernel_w_i=int(kernel_w),
            stride_h_i=int(stride_h),
            stride_w_i=int(stride_w),
            pad_h_i=int(pad_h),
            pad_w_i=int(pad_w),
            dilation_h_i=int(dilation_h),
            dilation_w_i=int(dilation_w),
            group_i=int(group),
            group_channels_i=int(group_channels),
            offset_scale_f=float(offset_scale),
            im2col_step_i=int(im2col_step),
        )


def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w,
                          dilation_h, dilation_w, pad_h=0, pad_w=0,
                          stride_h=1, stride_w=1):
    _, H_, W_, _ = spatial_shapes
    H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
    W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1

    ref_y, ref_x = torch.meshgrid(
        torch.linspace(
            # pad_h + 0.5,
            # H_ - pad_h - 0.5,
            (dilation_h * (kernel_h - 1)) // 2 + 0.5,
            (dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h,
            H_out,
            dtype=torch.float32,
            device=device),
        torch.linspace(
            # pad_w + 0.5,
            # W_ - pad_w - 0.5,
            (dilation_w * (kernel_w - 1)) // 2 + 0.5,
            (dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w,
            W_out,
            dtype=torch.float32,
            device=device))
    ref_y = ref_y.reshape(-1)[None] / H_
    ref_x = ref_x.reshape(-1)[None] / W_

    ref = torch.stack((ref_x, ref_y), -1).reshape(1, H_out, W_out, 1, 2)

    return ref


def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h,
                             dilation_w, group, device):
    _, H_, W_, _ = spatial_shapes
    points_list = []
    x, y = torch.meshgrid(
        torch.linspace(
            -((dilation_w * (kernel_w - 1)) // 2),
            -((dilation_w * (kernel_w - 1)) // 2) + (kernel_w - 1) * dilation_w,
            kernel_w,
            dtype=torch.float32,
            device=device),
        torch.linspace(
            -((dilation_h * (kernel_h - 1)) // 2),
            -((dilation_h * (kernel_h - 1)) // 2) + (kernel_h - 1) * dilation_h,
            kernel_h,
            dtype=torch.float32,
            device=device))

    points_list.extend([x / W_, y / H_])
    grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\
        repeat(1, group, 1).permute(1, 0, 2)
    grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2)

    return grid


def dcnv3_core_pytorch(input, offset, mask, kernel_h, kernel_w, stride_h,
                       stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                       group_channels, offset_scale):
    # for debug and test only,
    # need to use cuda version instead
    input = F.pad(input, [0, 0, pad_h, pad_h, pad_w, pad_w])
    N_, H_in, W_in, _ = input.shape
    _, H_out, W_out, _ = offset.shape

    ref = _get_reference_points(
        input.shape, input.device, kernel_h, kernel_w, dilation_h, dilation_w,
        pad_h, pad_w, stride_h, stride_w)
    grid = _generate_dilation_grids(
        input.shape, kernel_h, kernel_w, dilation_h, dilation_w, group,
        input.device)
    spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
        repeat(1, 1, 1, group * kernel_h * kernel_w).to(input.device)

    sampling_locations = (ref + grid * offset_scale).repeat(
        N_, 1, 1, 1, 1).flatten(3, 4) + \
        offset * offset_scale / spatial_norm

    P_ = kernel_h * kernel_w
    sampling_grids = 2 * sampling_locations - 1
    # N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, W_in
    input_ = input.view(N_, H_in * W_in, group * group_channels).transpose(1, 2).\
        reshape(N_ * group, group_channels, H_in, W_in)
    # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
    sampling_grid_ = sampling_grids.view(N_, H_out * W_out, group, P_, 2).transpose(1, 2).\
        flatten(0, 1)
    # N_*group, group_channels, H_out*W_out, P_
    sampling_input_ = F.grid_sample(
        input_, sampling_grid_, mode='bilinear', padding_mode='zeros',
        align_corners=False)
    # (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_ -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_)
    mask = mask.view(N_, H_out * W_out, group, P_).transpose(1, 2).\
        reshape(N_ * group, 1, H_out * W_out, P_)
    output = (sampling_input_ * mask).sum(-1).view(
        N_, group * group_channels, H_out * W_out)

    return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
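A shape sketch for dcnv3_core_pytorch with a 3x3 kernel at stride 1 and padding 1, where H_out equals H and W_out equals W. In the real module the offsets and masks come from the DCNv3 linear heads; random tensors are used here purely to show the expected layouts:

# Hedged shape check for the reference PyTorch implementation above.
import torch
from projects.mmdet3d_plugin.bevformer.backbones.ops_dcnv3.functions import \
    dcnv3_core_pytorch

N, H, W, group, group_channels = 2, 8, 8, 4, 16
C = group * group_channels
x = torch.randn(N, H, W, C)                          # channels-last input
offset = torch.randn(N, H, W, group * 3 * 3 * 2)     # 2 coords per sample point
mask = torch.softmax(torch.randn(N, H, W, group, 3 * 3), -1).reshape(N, H, W, -1)

out = dcnv3_core_pytorch(x, offset, mask,
                         kernel_h=3, kernel_w=3, stride_h=1, stride_w=1,
                         pad_h=1, pad_w=1, dilation_h=1, dilation_w=1,
                         group=group, group_channels=group_channels,
                         offset_scale=1.0)
print(out.shape)  # torch.Size([2, 8, 8, 64])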
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/make.sh (new file, mode 100755)

#!/usr/bin/env bash
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
python setup.py build install
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/__init__.py (new file, mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3 import DCNv3, DCNv3_pytorch
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/dcnv3.py
0 → 100644
View file @
df3c64a9
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
__future__
import
absolute_import
from
__future__
import
print_function
from
__future__
import
division
import
warnings
import
torch
from
torch
import
nn
import
torch.nn.functional
as
F
from
torch.nn.init
import
xavier_uniform_
,
constant_
from
..functions
import
DCNv3Function
,
dcnv3_core_pytorch
class
to_channels_first
(
nn
.
Module
):
def
__init__
(
self
):
super
().
__init__
()
def
forward
(
self
,
x
):
return
x
.
permute
(
0
,
3
,
1
,
2
)
class
to_channels_last
(
nn
.
Module
):
def
__init__
(
self
):
super
().
__init__
()
def
forward
(
self
,
x
):
return
x
.
permute
(
0
,
2
,
3
,
1
)
def
build_norm_layer
(
dim
,
norm_layer
,
in_format
=
'channels_last'
,
out_format
=
'channels_last'
,
eps
=
1e-6
):
layers
=
[]
if
norm_layer
==
'BN'
:
if
in_format
==
'channels_last'
:
layers
.
append
(
to_channels_first
())
layers
.
append
(
nn
.
BatchNorm2d
(
dim
))
if
out_format
==
'channels_last'
:
layers
.
append
(
to_channels_last
())
elif
norm_layer
==
'LN'
:
if
in_format
==
'channels_first'
:
layers
.
append
(
to_channels_last
())
layers
.
append
(
nn
.
LayerNorm
(
dim
,
eps
=
eps
))
if
out_format
==
'channels_first'
:
layers
.
append
(
to_channels_first
())
else
:
raise
NotImplementedError
(
f
'build_norm_layer does not support
{
norm_layer
}
'
)
return
nn
.
Sequential
(
*
layers
)
def
build_act_layer
(
act_layer
):
if
act_layer
==
'ReLU'
:
return
nn
.
ReLU
(
inplace
=
True
)
elif
act_layer
==
'SiLU'
:
return
nn
.
SiLU
(
inplace
=
True
)
elif
act_layer
==
'GELU'
:
return
nn
.
GELU
()
raise
NotImplementedError
(
f
'build_act_layer does not support
{
act_layer
}
'
)
def
_is_power_of_2
(
n
):
if
(
not
isinstance
(
n
,
int
))
or
(
n
<
0
):
raise
ValueError
(
"invalid input for _is_power_of_2: {} (type: {})"
.
format
(
n
,
type
(
n
)))
return
(
n
&
(
n
-
1
)
==
0
)
and
n
!=
0
class CenterFeatureScaleModule(nn.Module):
    def forward(self,
                query,
                center_feature_scale_proj_weight,
                center_feature_scale_proj_bias):
        center_feature_scale = F.linear(
            query,
            weight=center_feature_scale_proj_weight,
            bias=center_feature_scale_proj_bias).sigmoid()
        return center_feature_scale
class DCNv3_pytorch(nn.Module):
    def __init__(self,
                 channels=64,
                 kernel_size=3,
                 dw_kernel_size=None,
                 stride=1,
                 pad=1,
                 dilation=1,
                 group=4,
                 offset_scale=1.0,
                 act_layer='GELU',
                 norm_layer='LN',
                 center_feature_scale=False):
        """
        DCNv3 Module (pure-PyTorch reference implementation)
        :param channels: input/output channel count
        :param kernel_size: spatial size of the sampling grid
        :param stride: stride of the sliding window
        :param pad: zero-padding added to both sides of the input
        :param dilation: spacing between sampling grid points
        :param group: number of deformable groups
        :param offset_scale: scale applied to the predicted offsets
        :param act_layer: activation in the depth-wise branch
        :param norm_layer: normalization in the depth-wise branch
        """
        super().__init__()
        if channels % group != 0:
            raise ValueError(
                f'channels must be divisible by group, but got {channels} and {group}')
        _d_per_group = channels // group
        dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size
        # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_group):
            warnings.warn(
                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
                "which is more efficient in our CUDA implementation.")

        self.offset_scale = offset_scale
        self.channels = channels
        self.kernel_size = kernel_size
        self.dw_kernel_size = dw_kernel_size
        self.stride = stride
        self.dilation = dilation
        self.pad = pad
        self.group = group
        self.group_channels = channels // group
        self.center_feature_scale = center_feature_scale

        self.dw_conv = nn.Sequential(
            nn.Conv2d(
                channels,
                channels,
                kernel_size=dw_kernel_size,
                stride=1,
                padding=(dw_kernel_size - 1) // 2,
                groups=channels),
            build_norm_layer(channels, norm_layer, 'channels_first', 'channels_last'),
            build_act_layer(act_layer))
        self.offset = nn.Linear(channels, group * kernel_size * kernel_size * 2)
        self.mask = nn.Linear(channels, group * kernel_size * kernel_size)
        self.input_proj = nn.Linear(channels, channels)
        self.output_proj = nn.Linear(channels, channels)
        self._reset_parameters()

        if center_feature_scale:
            self.center_feature_scale_proj_weight = nn.Parameter(
                torch.zeros((group, channels), dtype=torch.float))
            self.center_feature_scale_proj_bias = nn.Parameter(
                torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group))
            self.center_feature_scale_module = CenterFeatureScaleModule()
    def _reset_parameters(self):
        constant_(self.offset.weight.data, 0.)
        constant_(self.offset.bias.data, 0.)
        constant_(self.mask.weight.data, 0.)
        constant_(self.mask.bias.data, 0.)
        xavier_uniform_(self.input_proj.weight.data)
        constant_(self.input_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)
    def forward(self, input):
        """
        :param input: (N, H, W, C)
        :return output: (N, H, W, C)
        """
        N, H, W, _ = input.shape

        x = self.input_proj(input)
        x_proj = x

        x1 = input.permute(0, 3, 1, 2)
        x1 = self.dw_conv(x1)
        offset = self.offset(x1)
        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
        mask = F.softmax(mask, -1).reshape(N, H, W, -1)

        x = dcnv3_core_pytorch(
            x, offset, mask,
            self.kernel_size, self.kernel_size,
            self.stride, self.stride,
            self.pad, self.pad,
            self.dilation, self.dilation,
            self.group, self.group_channels,
            self.offset_scale)
        if self.center_feature_scale:
            center_feature_scale = self.center_feature_scale_module(
                x1, self.center_feature_scale_proj_weight,
                self.center_feature_scale_proj_bias)
            # N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels
            center_feature_scale = center_feature_scale[..., None].repeat(
                1, 1, 1, 1, self.channels // self.group).flatten(-2)
            x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
        x = self.output_proj(x)

        return x
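A minimal smoke test of the pure-PyTorch module above (a sketch; channels and group chosen so each group's width is a power of two, as the warning in __init__ recommends):

import torch
m = DCNv3_pytorch(channels=64, group=4)
x = torch.randn(2, 16, 16, 64)   # NHWC, as the forward docstring requires
y = m(x)
assert y.shape == (2, 16, 16, 64)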
class DCNv3(nn.Module):
    def __init__(self,
                 channels=64,
                 kernel_size=3,
                 dw_kernel_size=None,
                 stride=1,
                 pad=1,
                 dilation=1,
                 group=4,
                 offset_scale=1.0,
                 act_layer='GELU',
                 norm_layer='LN',
                 center_feature_scale=False):
        """
        DCNv3 Module (backed by the compiled DCNv3 CUDA extension)
        :param channels: input/output channel count
        :param kernel_size: spatial size of the sampling grid
        :param stride: stride of the sliding window
        :param pad: zero-padding added to both sides of the input
        :param dilation: spacing between sampling grid points
        :param group: number of deformable groups
        :param offset_scale: scale applied to the predicted offsets
        :param act_layer: activation in the depth-wise branch
        :param norm_layer: normalization in the depth-wise branch
        """
        super().__init__()
        if channels % group != 0:
            raise ValueError(
                f'channels must be divisible by group, but got {channels} and {group}')
        _d_per_group = channels // group
        dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size
        # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_group):
            warnings.warn(
                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 "
                "which is more efficient in our CUDA implementation.")

        self.offset_scale = offset_scale
        self.channels = channels
        self.kernel_size = kernel_size
        self.dw_kernel_size = dw_kernel_size
        self.stride = stride
        self.dilation = dilation
        self.pad = pad
        self.group = group
        self.group_channels = channels // group
        self.center_feature_scale = center_feature_scale

        self.dw_conv = nn.Sequential(
            nn.Conv2d(
                channels,
                channels,
                kernel_size=dw_kernel_size,
                stride=1,
                padding=(dw_kernel_size - 1) // 2,
                groups=channels),
            build_norm_layer(channels, norm_layer, 'channels_first', 'channels_last'),
            build_act_layer(act_layer))
        self.offset = nn.Linear(channels, group * kernel_size * kernel_size * 2)
        self.mask = nn.Linear(channels, group * kernel_size * kernel_size)
        self.input_proj = nn.Linear(channels, channels)
        self.output_proj = nn.Linear(channels, channels)
        self._reset_parameters()

        if center_feature_scale:
            self.center_feature_scale_proj_weight = nn.Parameter(
                torch.zeros((group, channels), dtype=torch.float))
            self.center_feature_scale_proj_bias = nn.Parameter(
                torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group))
            self.center_feature_scale_module = CenterFeatureScaleModule()
    def _reset_parameters(self):
        constant_(self.offset.weight.data, 0.)
        constant_(self.offset.bias.data, 0.)
        constant_(self.mask.weight.data, 0.)
        constant_(self.mask.bias.data, 0.)
        xavier_uniform_(self.input_proj.weight.data)
        constant_(self.input_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)
    def forward(self, input):
        """
        :param input: (N, H, W, C)
        :return output: (N, H, W, C)
        """
        N, H, W, _ = input.shape

        x = self.input_proj(input)
        x_proj = x
        dtype = x.dtype

        x1 = input.permute(0, 3, 1, 2)
        x1 = self.dw_conv(x1)
        offset = self.offset(x1)
        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
        mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype)

        x = DCNv3Function.apply(
            x, offset, mask,
            self.kernel_size, self.kernel_size,
            self.stride, self.stride,
            self.pad, self.pad,
            self.dilation, self.dilation,
            self.group, self.group_channels,
            self.offset_scale,
            256)  # im2col_step
        if self.center_feature_scale:
            center_feature_scale = self.center_feature_scale_module(
                x1, self.center_feature_scale_proj_weight,
                self.center_feature_scale_proj_bias)
            # N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels
            center_feature_scale = center_feature_scale[..., None].repeat(
                1, 1, 1, 1, self.channels // self.group).flatten(-2)
            x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
        x = self.output_proj(x)

        return x
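When the compiled extension is available, the two modules should be numerically interchangeable; a hedged consistency check (a sketch, assuming the DCNv3 extension has been built and a GPU is present):

import torch
if torch.cuda.is_available():
    ref = DCNv3_pytorch(channels=64, group=4).cuda().eval()
    fast = DCNv3(channels=64, group=4).cuda().eval()
    fast.load_state_dict(ref.state_dict())   # identical weights in both paths
    x = torch.randn(2, 16, 16, 64, device='cuda')
    with torch.no_grad():
        assert torch.allclose(ref(x), fast(x), atol=1e-4)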
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/setup.py
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import glob

import torch

from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension

from setuptools import find_packages
from setuptools import setup

requirements = ["torch", "torchvision"]


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    sources = main_file + source_cpu
    extension = CppExtension
    extra_compile_args = {"cxx": []}
    define_macros = []

    if torch.cuda.is_available() and CUDA_HOME is not None:
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            # "-DCUDA_HAS_FP16=1",
            # "-D__CUDA_NO_HALF_OPERATORS__",
            # "-D__CUDA_NO_HALF_CONVERSIONS__",
            # "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
    else:
        raise NotImplementedError('CUDA is not available')

    sources = [os.path.join(extensions_dir, s) for s in sources]
    include_dirs = [extensions_dir]
    ext_modules = [
        extension(
            "DCNv3",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
    return ext_modules


setup(
    name="DCNv3",
    version="1.0",
    author="InternImage",
    url="https://github.com/OpenGVLab/InternImage",
    description="PyTorch Wrapper for CUDA Functions of DCNv3",
    packages=find_packages(exclude=("configs", "tests")),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
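setup.py refuses to build without CUDA, so it is worth checking the toolchain before running make.sh; a small preflight sketch:

import torch
from torch.utils.cpp_extension import CUDA_HOME
# both conditions mirror the get_extensions() branch above
print('cuda available:', torch.cuda.is_available())
print('CUDA_HOME:', CUDA_HOME)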
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
                             const at::Tensor &mask, const int kernel_h,
                             const int kernel_w, const int stride_h,
                             const int stride_w, const int pad_h,
                             const int pad_w, const int dilation_h,
                             const int dilation_w, const int group,
                             const int group_channels,
                             const float offset_scale, const int im2col_step) {
    AT_ERROR("Not implemented on cpu");
}

std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
                   const at::Tensor &mask, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_h, const int pad_w, const int dilation_h,
                   const int dilation_w, const int group,
                   const int group_channels, const float offset_scale,
                   const at::Tensor &grad_output, const int im2col_step) {
    AT_ERROR("Not implemented on cpu");
}
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.h
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
                             const at::Tensor &mask, const int kernel_h,
                             const int kernel_w, const int stride_h,
                             const int stride_w, const int pad_h,
                             const int pad_w, const int dilation_h,
                             const int dilation_w, const int group,
                             const int group_channels,
                             const float offset_scale, const int im2col_step);

std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
                   const at::Tensor &mask, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_h, const int pad_w, const int dilation_h,
                   const int dilation_w, const int group,
                   const int group_channels, const float offset_scale,
                   const at::Tensor &grad_output, const int im2col_step);
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "cuda/dcnv3_im2col_cuda.cuh"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
                              const at::Tensor &mask, const int kernel_h,
                              const int kernel_w, const int stride_h,
                              const int stride_w, const int pad_h,
                              const int pad_w, const int dilation_h,
                              const int dilation_w, const int group,
                              const int group_channels,
                              const float offset_scale, const int im2col_step) {
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
        1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
        1;
    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(
        channels == (group * group_channels),
        "Input channels and group times group channels wont match: (%d vs %d).",
        channels, group * group_channels);

    auto output =
        at::zeros({batch, height_out, width_out, group * group_channels},
                  input.options());
    const int batch_n = im2col_step_;
    auto output_n = output.view(
        {batch / batch_n, batch_n, height_out, width_out,
         group * group_channels});
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size =
        height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto columns = output_n.select(0, n);
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_forward_cuda", ([&] {
                dcnv3_im2col_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    input.data<scalar_t>() +
                        n * im2col_step_ * per_input_size,
                    offset.data<scalar_t>() +
                        n * im2col_step_ * per_offset_size,
                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
                    columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
                    stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                    group_channels, batch_n, height_in, width_in, height_out,
                    width_out, offset_scale);
            }));
    }

    return output;
}
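The output spatial size above follows the standard convolution arithmetic, out = (in + 2*pad - (dilation*(k-1)+1)) / stride + 1. A quick check in Python with illustrative numbers:

def conv_out(size, k, stride, pad, dilation):
    return (size + 2 * pad - (dilation * (k - 1) + 1)) // stride + 1

# the DCNv3 defaults (k=3, stride=1, pad=1, dilation=1) preserve resolution:
assert conv_out(16, 3, 1, 1, 1) == 16
# stride 2 halves it:
assert conv_out(16, 3, 2, 1, 1) == 8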
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                    const at::Tensor &mask, const int kernel_h,
                    const int kernel_w, const int stride_h, const int stride_w,
                    const int pad_h, const int pad_w, const int dilation_h,
                    const int dilation_w, const int group,
                    const int group_channels, const float offset_scale,
                    const at::Tensor &grad_output, const int im2col_step) {
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(grad_output.is_contiguous(),
               "grad_output tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
    AT_ASSERTM(grad_output.type().is_cuda(),
               "grad_output must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
        1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
        1;
    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(
        channels == (group * group_channels),
        "Input channels and group times group channels wont match: (%d vs %d).",
        channels, group * group_channels);

    auto dtype = input.dtype();
    if (dtype == at::kHalf) {
        dtype = at::kFloat;
    }

    auto grad_input = at::zeros_like(input, dtype);
    auto grad_offset = at::zeros_like(offset, dtype);
    auto grad_mask = at::zeros_like(mask, dtype);

    const int batch_n = im2col_step_;
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size =
        height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    auto grad_output_n =
        grad_output.view({batch / im2col_step_, batch_n,
                          height_out * width_out, group, group_channels});

    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto grad_output_g = grad_output_n.select(0, n);
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_backward_cuda", ([&] {
                dcnv3_col2im_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    grad_output_g.data<scalar_t>(),
                    input.data<scalar_t>() +
                        n * im2col_step_ * per_input_size,
                    offset.data<scalar_t>() +
                        n * im2col_step_ * per_offset_size,
                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, batch_n,
                    height_in, width_in, height_out, width_out, offset_scale,
                    grad_input.data<opmath_t>() +
                        n * im2col_step_ * per_input_size,
                    grad_offset.data<opmath_t>() +
                        n * im2col_step_ * per_offset_size,
                    grad_mask.data<opmath_t>() +
                        n * im2col_step_ * per_mask_size);
            }));
    }

    if (input.dtype() == torch::kHalf) {
        return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
                grad_mask.to(torch::kHalf)};
    } else {
        return {grad_input, grad_offset, grad_mask};
    }
}
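Note the dtype handling above: for half-precision inputs the gradient buffers are allocated as float, accumulated at full precision, and only cast back to half at the very end. The same accumulate-wide, cast-once idea at the Python level (illustrative):

import torch
ones = torch.ones(3000, dtype=torch.float16)
naive = torch.zeros((), dtype=torch.float16)
for v in ones:                    # fp16 running sum stalls at 2048 (ulp there is 2)
    naive = naive + v
wide = ones.float().sum().half()  # accumulate in fp32, cast once -> 3000
assert float(wide) == 3000 and float(naive) < 3000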
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.h
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
                              const at::Tensor &mask, const int kernel_h,
                              const int kernel_w, const int stride_h,
                              const int stride_w, const int pad_h,
                              const int pad_w, const int dilation_h,
                              const int dilation_w, const int group,
                              const int group_channels,
                              const float offset_scale, const int im2col_step);

std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                    const at::Tensor &mask, const int kernel_h,
                    const int kernel_w, const int stride_h, const int stride_w,
                    const int pad_h, const int pad_w, const int dilation_h,
                    const int dilation_w, const int group,
                    const int group_channels, const float offset_scale,
                    const at::Tensor &grad_output, const int im2col_step);
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <ATen/ATen.h>
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 256;
inline int GET_BLOCKS(const int N, const int num_threads) {
    return (N + num_threads - 1) / num_threads;
}
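GET_BLOCKS is the usual ceiling division for sizing the launch grid; the +num_threads-1 term rounds any remainder up to one extra block. The same arithmetic in Python (illustrative numbers):

def get_blocks(n, num_threads):
    return (n + num_threads - 1) // num_threads

assert get_blocks(1000, 256) == 4   # 4 * 256 = 1024 threads cover 1000 elements
assert get_blocks(1024, 256) == 4
assert get_blocks(1025, 256) == 5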
#define opmath_t at::opmath_type<scalar_t>
template <typename scalar_t>
__device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
                                          const int &height, const int &width,
                                          const int &group,
                                          const int &group_channels,
                                          const opmath_t &h, const opmath_t &w,
                                          const int &g, const int &c) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = group * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = g * group_channels + c;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}
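The four corner weights are the standard bilinear factors: with fractional parts lh, lw the weights (1-lh)(1-lw), (1-lh)lw, lh(1-lw), lh*lw always sum to 1, and out-of-range corners contribute 0 exactly as in the device function above. A self-contained sketch:

import math

def bilinear(img, h, w):
    # img: list of rows; out-of-range corners contribute 0, as in the kernel above
    h0, w0 = math.floor(h), math.floor(w)
    lh, lw = h - h0, w - w0
    def at(r, c):
        return img[r][c] if 0 <= r < len(img) and 0 <= c < len(img[0]) else 0.0
    return ((1 - lh) * (1 - lw) * at(h0, w0) + (1 - lh) * lw * at(h0, w0 + 1)
            + lh * (1 - lw) * at(h0 + 1, w0) + lh * lw * at(h0 + 1, w0 + 1))

assert bilinear([[0.0, 1.0], [2.0, 3.0]], 0.5, 0.5) == 1.5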
template <typename scalar_t>
__device__ void dcnv3_col2im_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &group_channels, const opmath_t &h,
    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * group_channels + c;

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const opmath_t top_grad_im = top_grad * mask;
    opmath_t grad_h_weight = 0, grad_w_weight = 0;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
    }

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    *grad_mask = top_grad * val;
    *grad_offset = offset_scale * grad_w_weight * top_grad_im;
    *(grad_offset + 1) = offset_scale * grad_h_weight * top_grad_im;
}
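The accumulators follow from differentiating the bilinear value val = w1*v1 + w2*v2 + w3*v3 + w4*v4 with respect to the sampling location: lh enters w3 and w4 positively and w1, w2 negatively, which is exactly the sign pattern of grad_h_weight above; the offset gradient then picks up the chain-rule factor offset_scale from loc = p0 + (k*dilation + offset)*offset_scale, and the mask gradient is just top_grad * val. A self-contained numeric check (illustrative values):

def val(h, w, v1, v2, v3, v4):
    lh, lw = h, w                     # fractional parts, with h_low = w_low = 0
    return ((1-lh)*(1-lw)*v1 + (1-lh)*lw*v2 + lh*(1-lw)*v3 + lh*lw*v4)

v1, v2, v3, v4 = 0.0, 1.0, 2.0, 3.0
h, w, eps = 0.3, 0.6, 1e-6
num = (val(h+eps, w, v1, v2, v3, v4) - val(h-eps, w, v1, v2, v3, v4)) / (2*eps)
ana = -(1-w)*v1 - w*v2 + (1-w)*v3 + w*v4   # the grad_h_weight accumulation above
assert abs(num - ana) < 1e-6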
template <typename scalar_t>
__device__ void dcnv3_col2im_bilinear_gm(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &group_channels, const opmath_t &h,
    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * group_channels + c;

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const opmath_t top_grad_im = top_grad * mask;
    opmath_t grad_h_weight = 0, grad_w_weight = 0;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
    }

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    atomicAdd(grad_mask, top_grad * val);
    atomicAdd(grad_offset, offset_scale * grad_w_weight * top_grad_im);
    atomicAdd(grad_offset + 1, offset_scale * grad_h_weight * top_grad_im);
}
template <typename scalar_t>
__global__ void dcnv3_im2col_gpu_kernel(
    const int num_kernels, const scalar_t *data_im, const scalar_t *data_offset,
    const scalar_t *data_mask, scalar_t *data_col, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const int input_size = height_in * width_in;
        scalar_t *data_col_ptr = data_col + index;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int qid_stride = group * group_channels;
        opmath_t col = 0;
        const scalar_t *data_im_ptr = data_im + b_col * input_size * qid_stride;
        // top-left
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    col += dcnv3_im2col_bilinear(
                               data_im_ptr, height_in, width_in, group,
                               group_channels, loc_h, loc_w, g_col, c_col) *
                           weight;
                }
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
            }
        }
        *data_col_ptr = col;
    }
}
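Each thread derives its (batch, output row, output column, group, channel) coordinates by repeated modulo/divide of the flat index, innermost dimension first. A Python sketch of the same decomposition (sizes illustrative):

def decompose(index, group_channels, group, width_out, height_out):
    c_col = index % group_channels; index //= group_channels
    g_col = index % group;          index //= group
    w_out = index % width_out;      index //= width_out
    h_out = index % height_out;     index //= height_out
    b_col = index
    return b_col, h_out, w_out, g_col, c_col

# flat layout is (batch, h_out, w_out, group, group_channels), channels fastest:
assert decompose(0, 16, 4, 8, 8) == (0, 0, 0, 0, 0)
assert decompose(16, 16, 4, 8, 8) == (0, 0, 0, 1, 0)       # next group
assert decompose(16 * 4, 16, 4, 8, 8) == (0, 0, 1, 0, 0)   # next output column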
// debug
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        __shared__ opmath_t cache_grad_offset[blockSize * 2];
        __shared__ opmath_t cache_grad_mask[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();
                if (tid == 0) {
                    opmath_t _grad_w = cache_grad_offset[0],
                             _grad_h = cache_grad_offset[1],
                             _grad_a = cache_grad_mask[0];
                    int sid = 2;
                    for (unsigned int tid = 1; tid < blockSize; ++tid) {
                        _grad_w += cache_grad_offset[sid];
                        _grad_h += cache_grad_offset[sid + 1];
                        _grad_a += cache_grad_mask[tid];
                        sid += 2;
                    }

                    *grad_offset = _grad_w;
                    *(grad_offset + 1) = _grad_h;
                    *grad_mask = _grad_a;
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        __shared__ opmath_t cache_grad_offset[blockSize * 2];
        __shared__ opmath_t cache_grad_mask[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    *grad_offset = cache_grad_offset[0];
                    *(grad_offset + 1) = cache_grad_offset[1];
                    *grad_mask = cache_grad_mask[0];
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
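v2 replaces v1's serial sweep by thread 0 with the classic shared-memory tree reduction: at each step the live half of the threads folds the other half in, so a block of blockSize threads finishes in log2(blockSize) steps instead of blockSize - 1 serial additions. A Python sketch of the halving pattern:

def tree_reduce(vals):
    # vals has power-of-two length, mirroring the blockSize template parameter
    s = len(vals) // 2
    while s > 0:
        for tid in range(s):
            vals[tid] += vals[tid + s]
        s >>= 1
    return vals[0]

assert tree_reduce([1.0] * 64) == 64.0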
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v1(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();
                if (tid == 0) {
                    opmath_t _grad_w = cache_grad_offset[0],
                             _grad_h = cache_grad_offset[1],
                             _grad_a = cache_grad_mask[0];
                    int sid = 2;
                    for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
                        _grad_w += cache_grad_offset[sid];
                        _grad_h += cache_grad_offset[sid + 1];
                        _grad_a += cache_grad_mask[tid];
                        sid += 2;
                    }

                    *grad_offset = _grad_w;
                    *(grad_offset + 1) = _grad_h;
                    *grad_mask = _grad_a;
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockDim.x / 2, spre = blockDim.x;
                     s > 0; s >>= 1, spre >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                        if (tid + (s << 1) < spre) {
                            cache_grad_mask[tid] +=
                                cache_grad_mask[tid + (s << 1)];
                            cache_grad_offset[xid1] +=
                                cache_grad_offset[xid2 + (s << 1)];
                            cache_grad_offset[xid1 + 1] +=
                                cache_grad_offset[xid2 + 1 + (s << 1)];
                        }
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    *grad_offset = cache_grad_offset[0];
                    *(grad_offset + 1) = cache_grad_offset[1];
                    *grad_mask = cache_grad_mask[0];
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockDim.x / 2, spre = blockDim.x;
                     s > 0; s >>= 1, spre >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                        if (tid + (s << 1) < spre) {
                            cache_grad_mask[tid] +=
                                cache_grad_mask[tid + (s << 1)];
                            cache_grad_offset[xid1] +=
                                cache_grad_offset[xid2 + (s << 1)];
                            cache_grad_offset[xid1 + 1] +=
                                cache_grad_offset[xid2 + 1 + (s << 1)];
                        }
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    atomicAdd(grad_offset, cache_grad_offset[0]);
                    atomicAdd(grad_offset + 1, cache_grad_offset[1]);
                    atomicAdd(grad_mask, cache_grad_mask[0]);
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_gm(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear_gm(data_im_ptr, height_in, width_in,
                                             group, group_channels, loc_h,
                                             loc_w, g_col, c_col, offset_scale,
                                             top_grad, weight, grad_im_ptr,
                                             grad_offset, grad_mask);
                }
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im,
                       const scalar_t *data_offset, const scalar_t *data_mask,
                       scalar_t *data_col, const int kernel_h,
                       const int kernel_w, const int stride_h,
                       const int stride_w, const int pad_h, const int pad_w,
                       const int dilation_h, const int dilation_w,
                       const int group, const int group_channels,
                       const int batch_n, const int height_in,
                       const int width_in, const int height_out,
                       const int width_out, const opmath_t offset_scale) {
    const int num_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_actual_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_threads = CUDA_NUM_THREADS;
    dcnv3_im2col_gpu_kernel<scalar_t>
        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
           stream>>>(num_kernels, data_im, data_offset, data_mask, data_col,
                     kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                     dilation_h, dilation_w, group, group_channels, height_in,
                     width_in, height_out, width_out, offset_scale);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("error in dcnv3_im2col_cuda: %s\n", cudaGetErrorString(err));
    }
}
template <typename scalar_t>
void dcnv3_col2im_cuda(
    cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int batch_n, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    const int num_threads =
        (group_channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS
                                            : group_channels;
    const int num_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_actual_kernels =
        batch_n * height_out * width_out * group * group_channels;
    if (group_channels > 1024) {
        if ((group_channels & 1023) == 0) {
            dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   num_threads * 3 * sizeof(opmath_t), stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
        } else {
            dcnv3_col2im_gpu_kernel_gm<scalar_t>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
        }
    } else {
        switch (group_channels) {
        case 1:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 2:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 4:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 8:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 16:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                  16>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 32:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,
                                                                  32>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 64:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                  64>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 128:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                  128>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 256:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                  256>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 512:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                  512>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        case 1024:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                  1024>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   0, stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
            break;
        default:
            if (group_channels < 64) {
                dcnv3_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
                    <<<GET_BLOCKS(num_actual_kernels, num_threads),
                       num_threads, num_threads * 3 * sizeof(opmath_t),
                       stream>>>(
                        num_kernels, grad_col, data_im, data_offset,
                        data_mask, kernel_h, kernel_w, stride_h, stride_w,
                        pad_h, pad_w, dilation_h, dilation_w, group,
                        group_channels, height_in, width_in, height_out,
                        width_out, offset_scale, grad_im, grad_offset,
                        grad_mask);
            } else {
                dcnv3_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
                    <<<GET_BLOCKS(num_actual_kernels, num_threads),
                       num_threads, num_threads * 3 * sizeof(opmath_t),
                       stream>>>(
                        num_kernels, grad_col, data_im, data_offset,
                        data_mask, kernel_h, kernel_w, stride_h, stride_w,
                        pad_h, pad_w, dilation_h, dilation_w, group,
                        group_channels, height_in, width_in, height_out,
                        width_out, offset_scale, grad_im, grad_offset,
                        grad_mask);
            }
        }
    }
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
    }
}
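A note on the dispatch above: num_threads equals group_channels (capped at CUDA_NUM_THREADS), so each block reduces the gradient contributions for one group's channels. Power-of-two channel counts up to 1024 select compile-time-specialized blocksize_aware kernels, other counts below 1024 fall back to the runtime shared-memory reductions, and large counts that are not multiples of 1024 use the global-memory (_gm) kernel. The dynamic shared-memory budget, num_threads * 3 * sizeof(opmath_t), plausibly stages three partial gradients per thread (two offset components plus one mask weight); the following is an illustrative sketch of such a three-accumulator block reduction, assuming a power-of-two blockDim.x, and is not the repository's kernel.

// Illustrative block-level tree reduction over three interleaved
// accumulators per thread (e.g. d_offset_x, d_offset_y, d_mask), matching
// a blockDim.x * 3 * sizeof(opmath_t) shared-memory layout. Assumes
// blockDim.x is a power of two and that `cache` was filled and the block
// synchronized before the call. Not copied from this repository.
template <typename opmath_t>
__device__ void block_reduce3(opmath_t *cache, // blockDim.x * 3 entries
                              opmath_t *out) { // 3 entries
    const unsigned int tid = threadIdx.x;
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            cache[tid * 3 + 0] += cache[(tid + s) * 3 + 0];
            cache[tid * 3 + 1] += cache[(tid + s) * 3 + 1];
            cache[tid * 3 + 2] += cache[(tid + s) * 3 + 2];
        }
        __syncthreads(); // all threads reach this, so the barrier is safe
    }
    if (tid == 0) { // thread 0 holds the block-wide sums
        out[0] = cache[0];
        out[1] = cache[1];
        out[2] = cache[2];
    }
}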
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/dcnv3.h
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/dcnv3_cpu.h"
#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif
at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
                         const at::Tensor &mask, const int kernel_h,
                         const int kernel_w, const int stride_h,
                         const int stride_w, const int pad_h, const int pad_w,
                         const int dilation_h, const int dilation_w,
                         const int group, const int group_channels,
                         const float offset_scale, const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
                                  stride_h, stride_w, pad_h, pad_w,
                                  dilation_h, dilation_w, group,
                                  group_channels, offset_scale, im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
               const at::Tensor &mask, const int kernel_h,
               const int kernel_w, const int stride_h, const int stride_w,
               const int pad_h, const int pad_w, const int dilation_h,
               const int dilation_w, const int group,
               const int group_channels, const float offset_scale,
               const at::Tensor &grad_output, const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
                                   stride_h, stride_w, pad_h, pad_w,
                                   dilation_h, dilation_w, group,
                                   group_channels, offset_scale, grad_output,
                                   im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}
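These two dispatchers route CUDA tensors to the dcnv3_cuda_* implementations and raise AT_ERROR otherwise; they become callable from Python once registered with pybind11 in the extension's binding translation unit. A hedged sketch of what that registration typically looks like follows; the repository's actual binding file and exported names may differ, and TORCH_EXTENSION_NAME is supplied by the build system.

// Sketch of a typical pybind11 registration for the dispatchers declared
// in dcnv3.h; illustrative, not necessarily this repository's binding file.
#include <torch/extension.h>
#include "dcnv3.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward");
    m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward");
}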