OpenDAS / dcnv3 · Commits

Commit b64d9ca3 (unverified), authored Apr 17, 2023 by Wenhai Wang, committed by GitHub Apr 17, 2023

Merge pull request #105 from zhiqi-li/occupancy

support occupancy prediction

Parents: bdd98bcb, df3c64a9
Changes: 160 · showing 20 changed files with 3337 additions and 0 deletions (+3337, -0)
Changed files:

autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/__init__.py  +7  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/__init__.py  +3  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py  +200  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/test.py  +207  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/train.py  +67  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/__init__.py  +2  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/custom_layer_decay_optimizer_constructor.py  +142  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/internimage.py  +702  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/__init__.py  +7  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py  +188  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/make.sh  +8  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/__init__.py  +7  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/dcnv3.py  +345  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/setup.py  +75  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.cpp  +37  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.h  +31  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu  +174  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.h  +31  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh  +1045  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/dcnv3.h  +59  -0
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/__init__.py · new file (mode 100644)

from .dense_heads import *
from .detectors import *
from .modules import *
from .runner import *
from .hooks import *
from .backbones import *
(no newline at end of file)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/__init__.py · new file (mode 100644)

from .train import custom_train_model
from .mmdet_train import custom_train_detector
# from .test import custom_multi_gpu_test
(no newline at end of file)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py · new file (mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings

import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
                         Fp16OptimizerHook, OptimizerHook, build_optimizer,
                         build_runner, get_dist_info)
from mmcv.utils import build_from_cfg

from mmdet.core import EvalHook
from mmdet.datasets import (build_dataset, replace_ImageToTensor)
from mmdet.utils import get_root_logger
import time
import os.path as osp
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
from projects.mmdet3d_plugin.datasets import custom_build_dataset


def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # assert len(dataset) == 1
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        ) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        if eval_model is not None:
            eval_model = MMDistributedDataParallel(
                eval_model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
        if eval_model is not None:
            eval_model = MMDataParallel(
                eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    if 'runner' not in cfg:
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs

    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # register profiler hook
    # trace_config = dict(type='tb_trace', dir_name='work_dir')
    # profiler_config = dict(on_trace_ready=trace_config)
    # runner.register_profiler_hook(profiler_config)

    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            assert False
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))

        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        eval_cfg['jsonfile_prefix'] = osp.join(
            'val', cfg.work_dir,
            time.ctime().replace(' ', '_').replace(':', '_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
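For orientation, here is a minimal launch sketch for this entry point; the config path, field values, and the plugin dataset builder call are illustrative assumptions, not part of this commit:

from mmcv import Config
from mmdet.models import build_detector
from projects.mmdet3d_plugin.datasets import custom_build_dataset
from projects.mmdet3d_plugin.bevformer.apis.mmdet_train import custom_train_detector

cfg = Config.fromfile('projects/configs/occ_example.py')  # hypothetical config file
cfg.work_dir = './work_dirs/occ_example'
cfg.gpu_ids = [0]
cfg.seed = 0

model = build_detector(cfg.model)
datasets = [custom_build_dataset(cfg.data.train)]
# single-GPU, non-distributed run; validate=True registers the eval hook above
custom_train_detector(model, datasets, cfg, distributed=False, validate=True)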
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/test.py · new file (mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Xiaoyu Tian
# ---------------------------------------------
import os.path as osp
import pickle
import shutil
import tempfile
import time

import mmcv
import numpy as np
import pycocotools.mask as mask_util
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info

from mmdet.core import encode_mask_results


def custom_encode_mask_results(mask_results):
    """Encode bitmap mask to RLE code. Semantic Masks only.

    Args:
        mask_results (list | tuple[list]): bitmap mask results.
            In mask scoring rcnn, mask_results is a tuple of (segm_results,
            segm_cls_score).

    Returns:
        list | tuple: RLE encoded mask.
    """
    cls_segms = mask_results
    num_classes = len(cls_segms)
    encoded_mask_results = []
    for i in range(len(cls_segms)):
        encoded_mask_results.append(
            mask_util.encode(
                np.array(
                    cls_segms[i][:, :, np.newaxis], order='F',
                    dtype='uint8'))[0])  # encoded with RLE
    return [encoded_mask_results]


def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.

    This method tests model with multiple gpus and collects the results
    under two different modes: gpu and cpu modes. By setting
    'gpu_collect=True' it encodes results to gpu tensors and uses gpu
    communication for results collection. On cpu mode it saves the results on
    different gpus to 'tmpdir' and collects them by the rank 0 worker.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.

    Returns:
        list: The prediction results.
    """
    model.eval()
    bbox_results = []
    mask_results = []
    occ_results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.
    have_mask = False
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            bs = result.shape[0]
            assert bs == 1, \
                'Evaluation only supports batch_size=1 in this version'
            # encode mask results
            if isinstance(result, dict):
                if 'bbox_results' in result.keys():
                    bbox_result = result['bbox_results']
                    batch_size = len(result['bbox_results'])
                    bbox_results.extend(bbox_result)
                if 'mask_results' in result.keys() and \
                        result['mask_results'] is not None:
                    mask_result = custom_encode_mask_results(
                        result['mask_results'])
                    mask_results.extend(mask_result)
                    have_mask = True
            else:
                batch_size = 1
                occ_results.extend(
                    [result.squeeze(dim=0).cpu().numpy().astype(np.uint8)])
                # batch_size = len(result)
                # bbox_results.extend(result)

            # if isinstance(result[0], tuple):
            #     assert False, 'this code is for instance segmentation, which our code will not utilize.'
            #     result = [(bbox_results, encode_mask_results(mask_results))
            #               for bbox_results, mask_results in result]
        if rank == 0:
            for _ in range(batch_size * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        bbox_results = collect_results_gpu(bbox_results, len(dataset))
        if have_mask:
            mask_results = collect_results_gpu(mask_results, len(dataset))
        else:
            mask_results = None
    else:
        # bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
        # tmpdir = tmpdir + '_mask' if tmpdir is not None else None
        # if have_mask:
        #     mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
        # else:
        #     mask_results = None
        tmpdir = tmpdir + '_occ' if tmpdir is not None else None
        occ_results = collect_results_cpu(occ_results, len(dataset), tmpdir)
    return occ_results


def collect_results_cpu(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            mmcv.mkdir_or_exist('.dist_test')
            tmpdir = tempfile.mkdtemp(dir='.dist_test')
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f'part_{i}.pkl')
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        # Because the sampler used at the evaluation stage is changed so that
        # each gpu handles a contiguous chunk of samples, the parts are
        # concatenated in rank order instead of being interleaved.
        # for res in zip(*part_list):
        for res in part_list:
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results


def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3):
    """Test model with single gpu.

    This method tests model with single gpu and gives the 'show' option.
    By setting ``show=True``, it saves the visualization results under
    ``out_dir``.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        show (bool): Whether to save visualization results.
            Default: False.
        out_dir (str): The path to save visualization results.
            Default: None.

    Returns:
        list[dict]: The prediction results.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            results.extend(
                [result.squeeze(dim=0).cpu().numpy().astype(np.uint8)])

        batch_size = len(result)
        for _ in range(batch_size):
            prog_bar.update()
    return results


def collect_results_gpu(result_part, size):
    collect_results_cpu(result_part, size)
(no newline at end of file)
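As a usage sketch for the occupancy evaluation path (the dataloader and distributed launcher setup are assumptions; only custom_multi_gpu_test itself comes from this diff):

import torch
from mmcv.parallel import MMDistributedDataParallel

# assumes torch.distributed is already initialized and val_dataloader exists
model = MMDistributedDataParallel(
    model.cuda(), device_ids=[torch.cuda.current_device()],
    broadcast_buffers=False)
occ_results = custom_multi_gpu_test(
    model, val_dataloader, tmpdir='./.dist_eval', gpu_collect=False)
if occ_results is not None:  # only rank 0 receives the gathered list
    # each entry is one sample's occupancy prediction as a uint8 numpy array
    print(len(occ_results), occ_results[0].shape)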
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/apis/train.py · new file (mode 100644)

# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .mmdet_train import custom_train_detector
from mmseg.apis import train_segmentor
from mmdet.apis import train_detector


def custom_train_model(model,
                       dataset,
                       cfg,
                       distributed=False,
                       validate=False,
                       timestamp=None,
                       eval_model=None,
                       meta=None):
    """A function wrapper for launching model training according to cfg.

    Because we need different eval_hook in runner. Should be deprecated in the
    future.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        assert False
    else:
        custom_train_detector(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            eval_model=eval_model,
            meta=meta)


def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """A function wrapper for launching model training according to cfg.

    Because we need different eval_hook in runner. Should be deprecated in the
    future.
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        train_segmentor(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            meta=meta)
    else:
        train_detector(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            meta=meta)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/__init__.py · new file (mode 100644)

from .internimage import InternImage
from .custom_layer_decay_optimizer_constructor import CustomLayerDecayOptimizerConstructor
(no newline at end of file)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/custom_layer_decay_optimizer_constructor.py · new file (mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
"""
Mostly copy-paste from BEiT library:
https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/layer_decay_optimizer_constructor.py
"""
import json

from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor
from mmcv.runner import get_dist_info
from mmdet.utils import get_root_logger


def get_num_layer_for_swin(var_name, num_max_layer, depths):
    if var_name.startswith("backbone.patch_embed"):
        return 0
    elif "level_embeds" in var_name:
        return 0
    elif var_name.startswith("backbone.layers") or \
            var_name.startswith("backbone.levels"):
        if var_name.split('.')[3] not in ['downsample', 'norm']:
            stage_id = int(var_name.split('.')[2])
            layer_id = int(var_name.split('.')[4])
            # layers for Swin-Large: [2, 2, 18, 2]
            if stage_id == 0:
                return layer_id + 1
            elif stage_id == 1:
                return layer_id + 1 + depths[0]
            elif stage_id == 2:
                return layer_id + 1 + depths[0] + depths[1]
            else:
                return layer_id + 1 + depths[0] + depths[1] + depths[2]
        else:
            stage_id = int(var_name.split('.')[2])
            if stage_id == 0:
                return 1 + depths[0]
            elif stage_id == 1:
                return 1 + depths[0] + depths[1]
            elif stage_id == 2:
                return 1 + depths[0] + depths[1] + depths[2]
            else:
                return 1 + depths[0] + depths[1] + depths[2]
    else:
        return num_max_layer - 1


@OPTIMIZER_BUILDERS.register_module()
class CustomLayerDecayOptimizerConstructor(DefaultOptimizerConstructor):

    def add_params(self, params, module, prefix='', is_dcn_module=None):
        """Add all parameters of module to the params list.

        The parameters of the given module will be added to the list of param
        groups, with specific rules defined by paramwise_cfg.

        Args:
            params (list[dict]): A list of param groups, it will be modified
                in place.
            module (nn.Module): The module to be added.
            prefix (str): The prefix of the module.
            is_dcn_module (int|float|None): If the current module is a
                submodule of DCN, `is_dcn_module` will be passed to
                control conv_offset layer's learning rate. Defaults to None.
        """
        parameter_groups = {}
        logger = get_root_logger()
        logger.info(self.paramwise_cfg)

        backbone_small_lr = self.paramwise_cfg.get('backbone_small_lr', False)
        dino_head = self.paramwise_cfg.get('dino_head', False)
        num_layers = self.paramwise_cfg.get('num_layers') + 2
        layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
        depths = self.paramwise_cfg.get('depths')
        offset_lr_scale = self.paramwise_cfg.get('offset_lr_scale', 1.0)

        logger.info("Build CustomLayerDecayOptimizerConstructor %f - %d" %
                    (layer_decay_rate, num_layers))
        weight_decay = self.base_wd

        for name, param in module.named_parameters():
            if not param.requires_grad:
                continue  # frozen weights
            if len(param.shape) == 1 or name.endswith(".bias") or \
                    "relative_position" in name or \
                    "norm" in name or \
                    "sampling_offsets" in name:
                group_name = "no_decay"
                this_weight_decay = 0.
            else:
                group_name = "decay"
                this_weight_decay = weight_decay

            layer_id = get_num_layer_for_swin(name, num_layers, depths)

            if layer_id == num_layers - 1 and dino_head and \
                    ("sampling_offsets" in name or "reference_points" in name):
                group_name = "layer_%d_%s_0.1x" % (layer_id, group_name)
            elif "sampling_offsets" in name or "reference_points" in name:
                group_name = "layer_%d_%s_offset_lr_scale" % (layer_id,
                                                              group_name)
            else:
                group_name = "layer_%d_%s" % (layer_id, group_name)

            if group_name not in parameter_groups:
                scale = layer_decay_rate ** (num_layers - layer_id - 1)
                if scale < 1 and backbone_small_lr == True:
                    scale = scale * 0.1
                if "0.1x" in group_name:
                    scale = scale * 0.1
                if "offset_lr_scale" in group_name:
                    scale = scale * offset_lr_scale

                parameter_groups[group_name] = {
                    "weight_decay": this_weight_decay,
                    "params": [],
                    "param_names": [],
                    "lr_scale": scale,
                    "group_name": group_name,
                    "lr": scale * self.base_lr,
                }

            parameter_groups[group_name]["params"].append(param)
            parameter_groups[group_name]["param_names"].append(name)

        rank, _ = get_dist_info()
        if rank == 0:
            to_display = {}
            for key in parameter_groups:
                to_display[key] = {
                    "param_names": parameter_groups[key]["param_names"],
                    "lr_scale": parameter_groups[key]["lr_scale"],
                    "lr": parameter_groups[key]["lr"],
                    "weight_decay": parameter_groups[key]["weight_decay"],
                }
            logger.info("Param groups = %s" % json.dumps(to_display, indent=2))

        # state_dict = module.state_dict()
        # for group_name in parameter_groups:
        #     group = parameter_groups[group_name]
        #     for name in group["param_names"]:
        #         group["params"].append(state_dict[name])
        params.extend(parameter_groups.values())
(no newline at end of file)
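A config sketch showing how this constructor is typically selected through mmcv's `constructor` key; the learning rate, decay rate, and depths below are illustrative assumptions, not values from this commit:

optimizer = dict(
    type='AdamW',
    lr=2e-4,
    weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=30,           # sum(depths); the constructor adds 2 internally
        layer_decay_rate=0.9,    # per-layer LR scale: rate ** (num_layers - layer_id - 1)
        depths=[4, 4, 18, 4],    # stage depths consumed by get_num_layer_for_swin
        offset_lr_scale=0.1))    # extra scale for sampling_offsets / reference_points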
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/internimage.py · new file (mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torch.nn as nn
from collections import OrderedDict
import torch.utils.checkpoint as checkpoint
from timm.models.layers import trunc_normal_, DropPath
from mmcv.runner import _load_checkpoint
from mmcv.cnn import constant_init, trunc_normal_init
from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES
import torch.nn.functional as F
from .ops_dcnv3 import modules as opsm


class to_channels_first(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.permute(0, 3, 1, 2)


class to_channels_last(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.permute(0, 2, 3, 1)


def build_norm_layer(dim,
                     norm_layer,
                     in_format='channels_last',
                     out_format='channels_last',
                     eps=1e-6):
    layers = []
    if norm_layer == 'BN':
        if in_format == 'channels_last':
            layers.append(to_channels_first())
        layers.append(nn.BatchNorm2d(dim))
        if out_format == 'channels_last':
            layers.append(to_channels_last())
    elif norm_layer == 'LN':
        if in_format == 'channels_first':
            layers.append(to_channels_last())
        layers.append(nn.LayerNorm(dim, eps=eps))
        if out_format == 'channels_first':
            layers.append(to_channels_first())
    else:
        raise NotImplementedError(
            f'build_norm_layer does not support {norm_layer}')
    return nn.Sequential(*layers)


def build_act_layer(act_layer):
    if act_layer == 'ReLU':
        return nn.ReLU(inplace=True)
    elif act_layer == 'SiLU':
        return nn.SiLU(inplace=True)
    elif act_layer == 'GELU':
        return nn.GELU()

    raise NotImplementedError(f'build_act_layer does not support {act_layer}')


class CrossAttention(nn.Module):
    r""" Cross Attention Module

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads. Default: 8
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: False.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        attn_drop (float, optional): Dropout ratio of attention weight.
            Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        attn_head_dim (int, optional): Dimension of attention head.
        out_dim (int, optional): Dimension of output.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 attn_head_dim=None,
                 out_dim=None):
        super().__init__()
        if out_dim is None:
            out_dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5
        assert all_head_dim == dim

        self.q = nn.Linear(dim, all_head_dim, bias=False)
        self.k = nn.Linear(dim, all_head_dim, bias=False)
        self.v = nn.Linear(dim, all_head_dim, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.k_bias = None
            self.v_bias = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, out_dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, k=None, v=None):
        B, N, C = x.shape
        N_k = k.shape[1]
        N_v = v.shape[1]

        q_bias, k_bias, v_bias = None, None, None
        if self.q_bias is not None:
            q_bias = self.q_bias
            k_bias = self.k_bias
            v_bias = self.v_bias

        q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
        q = q.reshape(B, N, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1, 4).squeeze(0)  # (B, N_head, N_q, dim)

        k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
        k = k.reshape(B, N_k, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1, 4).squeeze(0)

        v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
        v = v.reshape(B, N_v, 1, self.num_heads,
                      -1).permute(2, 0, 3, 1, 4).squeeze(0)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))  # (B, N_head, N_q, N_k)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class AttentiveBlock(nn.Module):
    r"""Attentive Block

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads. Default: 8
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: False.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        drop (float, optional): Dropout rate. Default: 0.0.
        attn_drop (float, optional): Attention dropout rate. Default: 0.0.
        drop_path (float | tuple[float], optional): Stochastic depth rate.
            Default: 0.0.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm.
        attn_head_dim (int, optional): Dimension of attention head. Default: None.
        out_dim (int, optional): Dimension of output. Default: None.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer="LN",
                 attn_head_dim=None,
                 out_dim=None):
        super().__init__()

        self.norm1_q = build_norm_layer(dim, norm_layer, eps=1e-6)
        self.norm1_k = build_norm_layer(dim, norm_layer, eps=1e-6)
        self.norm1_v = build_norm_layer(dim, norm_layer, eps=1e-6)
        self.cross_dcn = CrossAttention(dim,
                                        num_heads=num_heads,
                                        qkv_bias=qkv_bias,
                                        qk_scale=qk_scale,
                                        attn_drop=attn_drop,
                                        proj_drop=drop,
                                        attn_head_dim=attn_head_dim,
                                        out_dim=out_dim)

        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos,
                rel_pos_bias=None):
        x_q = self.norm1_q(x_q + pos_q)
        x_k = self.norm1_k(x_kv + pos_k)
        x_v = self.norm1_v(x_kv)

        x = self.cross_dcn(x_q, k=x_k, v=x_v)

        return x


class AttentionPoolingBlock(AttentiveBlock):

    def forward(self, x):
        x_q = x.mean(1, keepdim=True)
        x_kv = x
        pos_q, pos_k = 0, 0
        x = super().forward(x_q, x_kv, pos_q, pos_k,
                            bool_masked_pos=None,
                            rel_pos_bias=None)
        x = x.squeeze(1)
        return x


class StemLayer(nn.Module):
    r""" Stem layer of InternImage

    Args:
        in_chans (int): number of input channels
        out_chans (int): number of output channels
        act_layer (str): activation layer
        norm_layer (str): normalization layer
    """

    def __init__(self,
                 in_chans=3,
                 out_chans=96,
                 act_layer='GELU',
                 norm_layer='BN'):
        super().__init__()
        self.conv1 = nn.Conv2d(in_chans,
                               out_chans // 2,
                               kernel_size=3,
                               stride=2,
                               padding=1)
        self.norm1 = build_norm_layer(out_chans // 2, norm_layer,
                                      'channels_first', 'channels_first')
        self.act = build_act_layer(act_layer)
        self.conv2 = nn.Conv2d(out_chans // 2,
                               out_chans,
                               kernel_size=3,
                               stride=2,
                               padding=1)
        self.norm2 = build_norm_layer(out_chans, norm_layer,
                                      'channels_first', 'channels_last')

    def forward(self, x):
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.act(x)
        x = self.conv2(x)
        x = self.norm2(x)
        return x


class DownsampleLayer(nn.Module):
    r""" Downsample layer of InternImage

    Args:
        channels (int): number of input channels
        norm_layer (str): normalization layer
    """

    def __init__(self, channels, norm_layer='LN'):
        super().__init__()
        self.conv = nn.Conv2d(channels,
                              2 * channels,
                              kernel_size=3,
                              stride=2,
                              padding=1,
                              bias=False)
        self.norm = build_norm_layer(2 * channels, norm_layer,
                                     'channels_first', 'channels_last')

    def forward(self, x):
        x = self.conv(x.permute(0, 3, 1, 2))
        x = self.norm(x)
        return x


class MLPLayer(nn.Module):
    r""" MLP layer of InternImage

    Args:
        in_features (int): number of input features
        hidden_features (int): number of hidden features
        out_features (int): number of output features
        act_layer (str): activation layer
        drop (float): dropout rate
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer='GELU',
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = build_act_layer(act_layer)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class InternImageLayer(nn.Module):
    r""" Basic layer of InternImage

    Args:
        core_op (nn.Module): core operation of InternImage
        channels (int): number of input channels
        groups (list): Groups of each block.
        mlp_ratio (float): ratio of mlp hidden features to input channels
        drop (float): dropout rate
        drop_path (float): drop path rate
        act_layer (str): activation layer
        norm_layer (str): normalization layer
        post_norm (bool): whether to use post normalization
        layer_scale (float): layer scale
        offset_scale (float): offset scale
        with_cp (bool): whether to use checkpoint
    """

    def __init__(self,
                 core_op,
                 channels,
                 groups,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer='GELU',
                 norm_layer='LN',
                 post_norm=False,
                 layer_scale=None,
                 offset_scale=1.0,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False):  # for InternImage-H/G
        super().__init__()
        self.channels = channels
        self.groups = groups
        self.mlp_ratio = mlp_ratio
        self.with_cp = with_cp

        self.norm1 = build_norm_layer(channels, 'LN')
        self.post_norm = post_norm
        self.dcn = core_op(
            channels=channels,
            kernel_size=3,
            stride=1,
            pad=1,
            dilation=1,
            group=groups,
            offset_scale=offset_scale,
            act_layer=act_layer,
            norm_layer=norm_layer,
            dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
            center_feature_scale=center_feature_scale)  # for InternImage-H/G
        self.drop_path = DropPath(drop_path) if drop_path > 0. \
            else nn.Identity()
        self.norm2 = build_norm_layer(channels, 'LN')
        self.mlp = MLPLayer(in_features=channels,
                            hidden_features=int(channels * mlp_ratio),
                            act_layer=act_layer,
                            drop=drop)
        self.layer_scale = layer_scale is not None
        if self.layer_scale:
            self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
                                       requires_grad=True)
            self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
                                       requires_grad=True)
        self.res_post_norm = res_post_norm
        if res_post_norm:
            self.res_post_norm1 = build_norm_layer(channels, 'LN')
            self.res_post_norm2 = build_norm_layer(channels, 'LN')

    def forward(self, x):

        def _inner_forward(x):
            if not self.layer_scale:
                if self.post_norm:
                    x = x + self.drop_path(self.norm1(self.dcn(x)))
                    x = x + self.drop_path(self.norm2(self.mlp(x)))
                elif self.res_post_norm:  # for InternImage-H/G
                    x = x + self.drop_path(
                        self.res_post_norm1(self.dcn(self.norm1(x))))
                    x = x + self.drop_path(
                        self.res_post_norm2(self.mlp(self.norm2(x))))
                else:
                    x = x + self.drop_path(self.dcn(self.norm1(x)))
                    x = x + self.drop_path(self.mlp(self.norm2(x)))
                return x
            if self.post_norm:
                x = x + self.drop_path(self.gamma1 * self.norm1(self.dcn(x)))
                x = x + self.drop_path(self.gamma2 * self.norm2(self.mlp(x)))
            else:
                x = x + self.drop_path(self.gamma1 * self.dcn(self.norm1(x)))
                x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
            return x

        if self.with_cp and x.requires_grad:
            x = checkpoint.checkpoint(_inner_forward, x)
        else:
            x = _inner_forward(x)
        return x


class InternImageBlock(nn.Module):
    r""" Block of InternImage

    Args:
        core_op (nn.Module): core operation of InternImage
        channels (int): number of input channels
        depths (list): Depth of each block.
        groups (list): Groups of each block.
        mlp_ratio (float): ratio of mlp hidden features to input channels
        drop (float): dropout rate
        drop_path (float): drop path rate
        act_layer (str): activation layer
        norm_layer (str): normalization layer
        post_norm (bool): whether to use post normalization
        layer_scale (float): layer scale
        offset_scale (float): offset scale
        with_cp (bool): whether to use checkpoint
    """

    def __init__(self,
                 core_op,
                 channels,
                 depth,
                 groups,
                 downsample=True,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer='GELU',
                 norm_layer='LN',
                 post_norm=False,
                 offset_scale=1.0,
                 layer_scale=None,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False):  # for InternImage-H/G
        super().__init__()
        self.channels = channels
        self.depth = depth
        self.post_norm = post_norm
        self.center_feature_scale = center_feature_scale

        self.blocks = nn.ModuleList([
            InternImageLayer(
                core_op=core_op,
                channels=channels,
                groups=groups,
                mlp_ratio=mlp_ratio,
                drop=drop,
                drop_path=drop_path[i] if isinstance(
                    drop_path, list) else drop_path,
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            ) for i in range(depth)
        ])
        if not self.post_norm or center_feature_scale:
            self.norm = build_norm_layer(channels, 'LN')
        self.post_norm_block_ids = post_norm_block_ids
        if post_norm_block_ids is not None:  # for InternImage-H/G
            self.post_norms = nn.ModuleList(
                [build_norm_layer(channels, 'LN', eps=1e-6)
                 for _ in post_norm_block_ids])
        self.downsample = DownsampleLayer(
            channels=channels, norm_layer=norm_layer) if downsample else None

    def forward(self, x, return_wo_downsample=False):
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if (self.post_norm_block_ids is not None) and \
                    (i in self.post_norm_block_ids):  # for InternImage-H/G
                index = self.post_norm_block_ids.index(i)
                x = self.post_norms[index](x)
        if not self.post_norm or self.center_feature_scale:
            x = self.norm(x)
        if return_wo_downsample:
            x_ = x
        if self.downsample is not None:
            x = self.downsample(x)

        if return_wo_downsample:
            return x, x_
        return x


@BACKBONES.register_module()
class InternImage(nn.Module):
    r""" InternImage
    A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation
    Models with Deformable Convolutions` - https://arxiv.org/abs/2211.05778

    Args:
        core_op (str): Core operator. Default: 'DCNv3'
        channels (int): Number of channels in the first stage. Default: 64
        depths (list): Depth of each block. Default: [3, 4, 18, 5]
        groups (list): Groups of each block. Default: [3, 6, 12, 24]
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Probability of an element to be zeroed. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        act_layer (str): Activation layer. Default: 'GELU'
        norm_layer (str): Normalization layer. Default: 'LN'
        layer_scale (bool): Whether to use layer scale. Default: False
        cls_scale (bool): Whether to use class scale. Default: False
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
        dw_kernel_size (int): Size of the dwconv. Default: None
        level2_post_norm (bool): Whether to use level2 post norm. Default: False
        level2_post_norm_block_ids (list): Indexes of post norm blocks. Default: None
        res_post_norm (bool): Whether to use res post norm. Default: False
        center_feature_scale (bool): Whether to use center feature scale. Default: False
    """

    def __init__(self,
                 core_op='DCNv3',
                 channels=64,
                 depths=[3, 4, 18, 5],
                 groups=[3, 6, 12, 24],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.2,
                 drop_path_type='linear',
                 act_layer='GELU',
                 norm_layer='LN',
                 layer_scale=None,
                 offset_scale=1.0,
                 post_norm=False,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 level2_post_norm=False,  # for InternImage-H/G
                 level2_post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False,  # for InternImage-H/G
                 out_indices=(0, 1, 2, 3),
                 init_cfg=None,
                 **kwargs):
        super().__init__()
        self.core_op = core_op
        self.num_levels = len(depths)
        self.depths = depths
        self.channels = channels
        self.num_features = int(channels * 2 ** (self.num_levels - 1))
        self.post_norm = post_norm
        self.mlp_ratio = mlp_ratio
        self.init_cfg = init_cfg
        self.out_indices = out_indices
        self.level2_post_norm_block_ids = level2_post_norm_block_ids
        logger = get_root_logger()
        logger.info(f'using core type: {core_op}')
        logger.info(f'using activation layer: {act_layer}')
        logger.info(f'using main norm layer: {norm_layer}')
        logger.info(f'using dpr: {drop_path_type}, {drop_path_rate}')
        logger.info(f"level2_post_norm: {level2_post_norm}")
        logger.info(f"level2_post_norm_block_ids: {level2_post_norm_block_ids}")
        logger.info(f"res_post_norm: {res_post_norm}")

        in_chans = 3
        self.patch_embed = StemLayer(in_chans=in_chans,
                                     out_chans=channels,
                                     act_layer=act_layer,
                                     norm_layer=norm_layer)
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]
        if drop_path_type == 'uniform':
            for i in range(len(dpr)):
                dpr[i] = drop_path_rate

        self.levels = nn.ModuleList()
        for i in range(self.num_levels):
            post_norm_block_ids = level2_post_norm_block_ids if \
                level2_post_norm and (i == 2) else None  # for InternImage-H/G
            level = InternImageBlock(
                core_op=getattr(opsm, core_op),
                channels=int(channels * 2 ** i),
                depth=depths[i],
                groups=groups[i],
                mlp_ratio=self.mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                downsample=(i < self.num_levels - 1),
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                post_norm_block_ids=post_norm_block_ids,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            )
            self.levels.append(level)
        self.num_layers = len(depths)
        self.apply(self._init_weights)
        self.apply(self._init_deform_weights)

    def init_weights(self):
        logger = get_root_logger()
        if self.init_cfg is None:
            logger.warn(f'No pre-trained weights for '
                        f'{self.__class__.__name__}, '
                        f'training start from scratch')
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    trunc_normal_init(m, std=.02, bias=0.)
                elif isinstance(m, nn.LayerNorm):
                    constant_init(m, 1.0)
        else:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                f'specifying `Pretrained` in ' \
                f'`init_cfg` in ' \
                f'{self.__class__.__name__}'
            ckpt = _load_checkpoint(self.init_cfg.checkpoint,
                                    logger=logger,
                                    map_location='cpu')
            if 'state_dict' in ckpt:
                _state_dict = ckpt['state_dict']
            elif 'model' in ckpt:
                _state_dict = ckpt['model']
            else:
                _state_dict = ckpt

            state_dict = OrderedDict()
            for k, v in _state_dict.items():
                if k.startswith('backbone.'):
                    state_dict[k[9:]] = v
                else:
                    state_dict[k] = v

            # strip prefix of state_dict
            if list(state_dict.keys())[0].startswith('module.'):
                state_dict = {k[7:]: v for k, v in state_dict.items()}

            # load state_dict
            meg = self.load_state_dict(state_dict, False)
            logger.info(meg)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _init_deform_weights(self, m):
        if isinstance(m, getattr(opsm, self.core_op)):
            m._reset_parameters()

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.pos_drop(x)

        seq_out = []
        for level_idx, level in enumerate(self.levels):
            x, x_ = level(x, return_wo_downsample=True)
            if level_idx in self.out_indices:
                seq_out.append(x_.permute(0, 3, 1, 2).contiguous())
        return seq_out
(no newline at end of file)
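A quick instantiation sketch; the hyper-parameters and input size are illustrative assumptions. Note that importing this module at all requires the ops_dcnv3 extension to have been built via make.sh (dcnv3_func.py imports the compiled DCNv3 module at load time), but using DCNv3_pytorch as the core keeps the forward pass itself in pure PyTorch:

import torch

backbone = InternImage(core_op='DCNv3_pytorch',
                       channels=64,
                       depths=[4, 4, 18, 4],
                       groups=[4, 8, 16, 32],
                       out_indices=(0, 1, 2, 3))
feats = backbone(torch.randn(1, 3, 224, 224))
# four channels-first maps at strides 4/8/16/32 with 64/128/256/512 channels
for f in feats:
    print(f.shape)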
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/__init__.py · new file (mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py · new file (mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd

import DCNv3


class DCNv3Function(Function):

    @staticmethod
    @custom_fwd
    def forward(ctx, input, offset, mask, kernel_h, kernel_w, stride_h,
                stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                group_channels, offset_scale, im2col_step):
        ctx.kernel_h = kernel_h
        ctx.kernel_w = kernel_w
        ctx.stride_h = stride_h
        ctx.stride_w = stride_w
        ctx.pad_h = pad_h
        ctx.pad_w = pad_w
        ctx.dilation_h = dilation_h
        ctx.dilation_w = dilation_w
        ctx.group = group
        ctx.group_channels = group_channels
        ctx.offset_scale = offset_scale
        ctx.im2col_step = im2col_step
        output = DCNv3.dcnv3_forward(
            input, offset, mask, kernel_h, kernel_w, stride_h, stride_w,
            pad_h, pad_w, dilation_h, dilation_w, group, group_channels,
            offset_scale, ctx.im2col_step)
        ctx.save_for_backward(input, offset, mask)

        return output

    @staticmethod
    @once_differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        input, offset, mask = ctx.saved_tensors
        grad_input, grad_offset, grad_mask = \
            DCNv3.dcnv3_backward(
                input, offset, mask, ctx.kernel_h, ctx.kernel_w,
                ctx.stride_h, ctx.stride_w, ctx.pad_h, ctx.pad_w,
                ctx.dilation_h, ctx.dilation_w, ctx.group,
                ctx.group_channels, ctx.offset_scale,
                grad_output.contiguous(), ctx.im2col_step)

        return grad_input, grad_offset, grad_mask, \
            None, None, None, None, None, None, \
            None, None, None, None, None, None

    @staticmethod
    def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h,
                 stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                 group_channels, offset_scale, im2col_step):
        """Symbolic function for mmdeploy::DCNv3.

        Returns:
            DCNv3 op for onnx.
        """
        return g.op(
            'mmdeploy::TRTDCNv3',
            input,
            offset,
            mask,
            kernel_h_i=int(kernel_h),
            kernel_w_i=int(kernel_w),
            stride_h_i=int(stride_h),
            stride_w_i=int(stride_w),
            pad_h_i=int(pad_h),
            pad_w_i=int(pad_w),
            dilation_h_i=int(dilation_h),
            dilation_w_i=int(dilation_w),
            group_i=int(group),
            group_channels_i=int(group_channels),
            offset_scale_f=float(offset_scale),
            im2col_step_i=int(im2col_step),
        )


def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w,
                          dilation_h, dilation_w, pad_h=0, pad_w=0,
                          stride_h=1, stride_w=1):
    _, H_, W_, _ = spatial_shapes
    H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
    W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1

    ref_y, ref_x = torch.meshgrid(
        torch.linspace(
            # pad_h + 0.5,
            # H_ - pad_h - 0.5,
            (dilation_h * (kernel_h - 1)) // 2 + 0.5,
            (dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h,
            H_out,
            dtype=torch.float32,
            device=device),
        torch.linspace(
            # pad_w + 0.5,
            # W_ - pad_w - 0.5,
            (dilation_w * (kernel_w - 1)) // 2 + 0.5,
            (dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w,
            W_out,
            dtype=torch.float32,
            device=device))
    ref_y = ref_y.reshape(-1)[None] / H_
    ref_x = ref_x.reshape(-1)[None] / W_

    ref = torch.stack((ref_x, ref_y), -1).reshape(1, H_out, W_out, 1, 2)

    return ref


def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h,
                             dilation_w, group, device):
    _, H_, W_, _ = spatial_shapes
    points_list = []
    x, y = torch.meshgrid(
        torch.linspace(
            -((dilation_w * (kernel_w - 1)) // 2),
            -((dilation_w * (kernel_w - 1)) // 2) +
            (kernel_w - 1) * dilation_w,
            kernel_w,
            dtype=torch.float32,
            device=device),
        torch.linspace(
            -((dilation_h * (kernel_h - 1)) // 2),
            -((dilation_h * (kernel_h - 1)) // 2) +
            (kernel_h - 1) * dilation_h,
            kernel_h,
            dtype=torch.float32,
            device=device))

    points_list.extend([x / W_, y / H_])
    grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\
        repeat(1, group, 1).permute(1, 0, 2)
    grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2)

    return grid


def dcnv3_core_pytorch(input, offset, mask, kernel_h, kernel_w, stride_h,
                       stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                       group_channels, offset_scale):
    # for debug and test only,
    # need to use cuda version instead
    input = F.pad(input, [0, 0, pad_h, pad_h, pad_w, pad_w])
    N_, H_in, W_in, _ = input.shape
    _, H_out, W_out, _ = offset.shape

    ref = _get_reference_points(input.shape, input.device, kernel_h, kernel_w,
                                dilation_h, dilation_w, pad_h, pad_w,
                                stride_h, stride_w)
    grid = _generate_dilation_grids(input.shape, kernel_h, kernel_w,
                                    dilation_h, dilation_w, group,
                                    input.device)
    spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
        repeat(1, 1, 1, group * kernel_h * kernel_w).to(input.device)

    sampling_locations = (ref + grid * offset_scale).repeat(
        N_, 1, 1, 1, 1).flatten(3, 4) + \
        offset * offset_scale / spatial_norm

    P_ = kernel_h * kernel_w
    sampling_grids = 2 * sampling_locations - 1
    # N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels
    # -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, W_in
    input_ = input.view(N_, H_in * W_in, group * group_channels).\
        transpose(1, 2).reshape(N_ * group, group_channels, H_in, W_in)
    # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2
    # -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
    sampling_grid_ = sampling_grids.view(N_, H_out * W_out, group, P_, 2).\
        transpose(1, 2).flatten(0, 1)
    # N_*group, group_channels, H_out*W_out, P_
    sampling_input_ = F.grid_sample(
        input_, sampling_grid_, mode='bilinear', padding_mode='zeros',
        align_corners=False)

    # (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_
    # -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_)
    mask = mask.view(N_, H_out * W_out, group, P_).transpose(1, 2).\
        reshape(N_ * group, 1, H_out * W_out, P_)
    output = (sampling_input_ * mask).sum(-1).view(
        N_, group * group_channels, H_out * W_out)

    return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
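To make the expected channels-last layouts concrete, here is a quick shape check of the reference implementation above; the sizes are illustrative, and running it assumes the `import DCNv3` at the top of this file succeeds (the extension is built via make.sh):

import torch

N, H, W = 2, 14, 14
group, group_channels, K = 4, 16, 3
x = torch.randn(N, H, W, group * group_channels)
# one (dx, dy) offset and one modulation weight per group per kernel tap
offset = torch.randn(N, H, W, group * K * K * 2)
mask = torch.softmax(torch.randn(N, H, W, group, K * K), -1).reshape(N, H, W, -1)
out = dcnv3_core_pytorch(x, offset, mask,
                         kernel_h=K, kernel_w=K, stride_h=1, stride_w=1,
                         pad_h=1, pad_w=1, dilation_h=1, dilation_w=1,
                         group=group, group_channels=group_channels,
                         offset_scale=1.0)
print(out.shape)  # torch.Size([2, 14, 14, 64])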
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/make.sh · new file (mode 100755)

#!/usr/bin/env bash
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
python setup.py build install
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/__init__.py · new file (mode 100644)

# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dcnv3 import DCNv3, DCNv3_pytorch
(no newline at end of file)
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/modules/dcnv3.py
0 → 100644
View file @
b64d9ca3
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import warnings
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_

from ..functions import DCNv3Function, dcnv3_core_pytorch


class to_channels_first(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.permute(0, 3, 1, 2)


class to_channels_last(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x.permute(0, 2, 3, 1)


def build_norm_layer(dim,
                     norm_layer,
                     in_format='channels_last',
                     out_format='channels_last',
                     eps=1e-6):
    layers = []
    if norm_layer == 'BN':
        if in_format == 'channels_last':
            layers.append(to_channels_first())
        layers.append(nn.BatchNorm2d(dim))
        if out_format == 'channels_last':
            layers.append(to_channels_last())
    elif norm_layer == 'LN':
        if in_format == 'channels_first':
            layers.append(to_channels_last())
        layers.append(nn.LayerNorm(dim, eps=eps))
        if out_format == 'channels_first':
            layers.append(to_channels_first())
    else:
        raise NotImplementedError(
            f'build_norm_layer does not support {norm_layer}')
    return nn.Sequential(*layers)


def build_act_layer(act_layer):
    if act_layer == 'ReLU':
        return nn.ReLU(inplace=True)
    elif act_layer == 'SiLU':
        return nn.SiLU(inplace=True)
    elif act_layer == 'GELU':
        return nn.GELU()
    raise NotImplementedError(f'build_act_layer does not support {act_layer}')


def _is_power_of_2(n):
    if (not isinstance(n, int)) or (n < 0):
        raise ValueError(
            "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
    return (n & (n - 1) == 0) and n != 0


class CenterFeatureScaleModule(nn.Module):

    def forward(self,
                query,
                center_feature_scale_proj_weight,
                center_feature_scale_proj_bias):
        center_feature_scale = F.linear(
            query,
            weight=center_feature_scale_proj_weight,
            bias=center_feature_scale_proj_bias).sigmoid()
        return center_feature_scale


class DCNv3_pytorch(nn.Module):

    def __init__(self,
                 channels=64,
                 kernel_size=3,
                 dw_kernel_size=None,
                 stride=1,
                 pad=1,
                 dilation=1,
                 group=4,
                 offset_scale=1.0,
                 act_layer='GELU',
                 norm_layer='LN',
                 center_feature_scale=False):
        """
        DCNv3 Module
        :param channels
        :param kernel_size
        :param stride
        :param pad
        :param dilation
        :param group
        :param offset_scale
        :param act_layer
        :param norm_layer
        """
        super().__init__()
        if channels % group != 0:
            raise ValueError(
                f'channels must be divisible by group, but got {channels} and {group}')
        _d_per_group = channels // group
        dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size
        # you'd better set _d_per_group to a power of 2, which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_group):
            warnings.warn(
                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2, "
                "which is more efficient in our CUDA implementation.")

        self.offset_scale = offset_scale
        self.channels = channels
        self.kernel_size = kernel_size
        self.dw_kernel_size = dw_kernel_size
        self.stride = stride
        self.dilation = dilation
        self.pad = pad
        self.group = group
        self.group_channels = channels // group
        self.center_feature_scale = center_feature_scale

        self.dw_conv = nn.Sequential(
            nn.Conv2d(
                channels,
                channels,
                kernel_size=dw_kernel_size,
                stride=1,
                padding=(dw_kernel_size - 1) // 2,
                groups=channels),
            build_norm_layer(channels, norm_layer,
                             'channels_first', 'channels_last'),
            build_act_layer(act_layer))
        self.offset = nn.Linear(channels, group * kernel_size * kernel_size * 2)
        self.mask = nn.Linear(channels, group * kernel_size * kernel_size)
        self.input_proj = nn.Linear(channels, channels)
        self.output_proj = nn.Linear(channels, channels)
        self._reset_parameters()

        if center_feature_scale:
            self.center_feature_scale_proj_weight = nn.Parameter(
                torch.zeros((group, channels), dtype=torch.float))
            self.center_feature_scale_proj_bias = nn.Parameter(
                torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group, ))
            self.center_feature_scale_module = CenterFeatureScaleModule()

    def _reset_parameters(self):
        constant_(self.offset.weight.data, 0.)
        constant_(self.offset.bias.data, 0.)
        constant_(self.mask.weight.data, 0.)
        constant_(self.mask.bias.data, 0.)
        xavier_uniform_(self.input_proj.weight.data)
        constant_(self.input_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, input):
        """
        :param input                       (N, H, W, C)
        :return output                     (N, H, W, C)
        """
        N, H, W, _ = input.shape

        x = self.input_proj(input)
        x_proj = x

        x1 = input.permute(0, 3, 1, 2)
        x1 = self.dw_conv(x1)
        offset = self.offset(x1)
        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
        mask = F.softmax(mask, -1).reshape(N, H, W, -1)

        x = dcnv3_core_pytorch(
            x, offset, mask,
            self.kernel_size, self.kernel_size,
            self.stride, self.stride,
            self.pad, self.pad,
            self.dilation, self.dilation,
            self.group, self.group_channels,
            self.offset_scale)
        if self.center_feature_scale:
            center_feature_scale = self.center_feature_scale_module(
                x1, self.center_feature_scale_proj_weight,
                self.center_feature_scale_proj_bias)
            # N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels
            center_feature_scale = center_feature_scale[..., None].repeat(
                1, 1, 1, 1, self.channels // self.group).flatten(-2)
            x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
        x = self.output_proj(x)

        return x
class DCNv3(nn.Module):

    def __init__(self,
                 channels=64,
                 kernel_size=3,
                 dw_kernel_size=None,
                 stride=1,
                 pad=1,
                 dilation=1,
                 group=4,
                 offset_scale=1.0,
                 act_layer='GELU',
                 norm_layer='LN',
                 center_feature_scale=False):
        """
        DCNv3 Module
        :param channels
        :param kernel_size
        :param stride
        :param pad
        :param dilation
        :param group
        :param offset_scale
        :param act_layer
        :param norm_layer
        """
        super().__init__()
        if channels % group != 0:
            raise ValueError(
                f'channels must be divisible by group, but got {channels} and {group}')
        _d_per_group = channels // group
        dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size
        # you'd better set _d_per_group to a power of 2, which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_group):
            warnings.warn(
                "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2, "
                "which is more efficient in our CUDA implementation.")

        self.offset_scale = offset_scale
        self.channels = channels
        self.kernel_size = kernel_size
        self.dw_kernel_size = dw_kernel_size
        self.stride = stride
        self.dilation = dilation
        self.pad = pad
        self.group = group
        self.group_channels = channels // group
        self.center_feature_scale = center_feature_scale

        self.dw_conv = nn.Sequential(
            nn.Conv2d(
                channels,
                channels,
                kernel_size=dw_kernel_size,
                stride=1,
                padding=(dw_kernel_size - 1) // 2,
                groups=channels),
            build_norm_layer(channels, norm_layer,
                             'channels_first', 'channels_last'),
            build_act_layer(act_layer))
        self.offset = nn.Linear(channels, group * kernel_size * kernel_size * 2)
        self.mask = nn.Linear(channels, group * kernel_size * kernel_size)
        self.input_proj = nn.Linear(channels, channels)
        self.output_proj = nn.Linear(channels, channels)
        self._reset_parameters()

        if center_feature_scale:
            self.center_feature_scale_proj_weight = nn.Parameter(
                torch.zeros((group, channels), dtype=torch.float))
            self.center_feature_scale_proj_bias = nn.Parameter(
                torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group, ))
            self.center_feature_scale_module = CenterFeatureScaleModule()

    def _reset_parameters(self):
        constant_(self.offset.weight.data, 0.)
        constant_(self.offset.bias.data, 0.)
        constant_(self.mask.weight.data, 0.)
        constant_(self.mask.bias.data, 0.)
        xavier_uniform_(self.input_proj.weight.data)
        constant_(self.input_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, input):
        """
        :param input                       (N, H, W, C)
        :return output                     (N, H, W, C)
        """
        N, H, W, _ = input.shape

        x = self.input_proj(input)
        x_proj = x
        dtype = x.dtype

        x1 = input.permute(0, 3, 1, 2)
        x1 = self.dw_conv(x1)
        offset = self.offset(x1)
        mask = self.mask(x1).reshape(N, H, W, self.group, -1)
        mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype)

        x = DCNv3Function.apply(
            x, offset, mask,
            self.kernel_size, self.kernel_size,
            self.stride, self.stride,
            self.pad, self.pad,
            self.dilation, self.dilation,
            self.group, self.group_channels,
            self.offset_scale,
            256)
        if self.center_feature_scale:
            center_feature_scale = self.center_feature_scale_module(
                x1, self.center_feature_scale_proj_weight,
                self.center_feature_scale_proj_bias)
            # N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels
            center_feature_scale = center_feature_scale[..., None].repeat(
                1, 1, 1, 1, self.channels // self.group).flatten(-2)
            x = x * (1 - center_feature_scale) + x_proj * center_feature_scale
        x = self.output_proj(x)

        return x
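A quick way to sanity-check the pure-PyTorch path above is a forward pass on a random channels-last tensor. The following is an illustrative sketch, not part of the commit; the import path is assumed from this file's location in the tree:

# Hypothetical smoke test for DCNv3_pytorch; shapes follow the forward docstring.
import torch
from projects.mmdet3d_plugin.bevformer.backbones.ops_dcnv3.modules import DCNv3_pytorch

m = DCNv3_pytorch(channels=64, kernel_size=3, stride=1, pad=1, group=4)
x = torch.randn(2, 32, 32, 64)  # (N, H, W, C) with C == channels
y = m(x)
assert y.shape == x.shape       # stride 1 with pad 1 keeps the spatial size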
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/setup.py
0 → 100644
View file @ b64d9ca3
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import glob

import torch

from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension

from setuptools import find_packages
from setuptools import setup

requirements = ["torch", "torchvision"]


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    sources = main_file + source_cpu
    extension = CppExtension
    extra_compile_args = {"cxx": []}
    define_macros = []

    if torch.cuda.is_available() and CUDA_HOME is not None:
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            # "-DCUDA_HAS_FP16=1",
            # "-D__CUDA_NO_HALF_OPERATORS__",
            # "-D__CUDA_NO_HALF_CONVERSIONS__",
            # "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
    else:
        raise NotImplementedError('CUDA is not available')

    sources = [os.path.join(extensions_dir, s) for s in sources]
    include_dirs = [extensions_dir]
    ext_modules = [
        extension(
            "DCNv3",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
    return ext_modules


setup(
    name="DCNv3",
    version="1.0",
    author="InternImage",
    url="https://github.com/OpenGVLab/InternImage",
    description="PyTorch Wrapper for CUDA Functions of DCNv3",
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
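make.sh simply runs this script with `build install`. Once it succeeds, the compiled extension should be importable under the `name=` given to setup() above. A minimal post-build probe, as an illustrative sketch (the symbols the module exports are defined in the C++ binding sources, so listing dir() is the safest check):

# Hypothetical post-build check for the compiled DCNv3 extension.
import torch  # import torch first so the extension can resolve libtorch symbols
import DCNv3
print([s for s in dir(DCNv3) if not s.startswith('_')])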
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.cpp
0 → 100644
View file @ b64d9ca3
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
                             const at::Tensor &mask, const int kernel_h,
                             const int kernel_w, const int stride_h,
                             const int stride_w, const int pad_h,
                             const int pad_w, const int dilation_h,
                             const int dilation_w, const int group,
                             const int group_channels, const float offset_scale,
                             const int im2col_step) {
    AT_ERROR("Not implemented on the CPU");
}

std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
                   const at::Tensor &mask, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_h, const int pad_w, const int dilation_h,
                   const int dilation_w, const int group,
                   const int group_channels, const float offset_scale,
                   const at::Tensor &grad_output, const int im2col_step) {
    AT_ERROR("Not implemented on the CPU");
}
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cpu/dcnv3_cpu.h
0 → 100644
View file @ b64d9ca3
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
                             const at::Tensor &mask, const int kernel_h,
                             const int kernel_w, const int stride_h,
                             const int stride_w, const int pad_h,
                             const int pad_w, const int dilation_h,
                             const int dilation_w, const int group,
                             const int group_channels, const float offset_scale,
                             const int im2col_step);

std::vector<at::Tensor>
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
                   const at::Tensor &mask, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_h, const int pad_w, const int dilation_h,
                   const int dilation_w, const int group,
                   const int group_channels, const float offset_scale,
                   const at::Tensor &grad_output, const int im2col_step);
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
0 → 100644
View file @ b64d9ca3
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "cuda/dcnv3_im2col_cuda.cuh"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
                              const at::Tensor &mask, const int kernel_h,
                              const int kernel_w, const int stride_h,
                              const int stride_w, const int pad_h,
                              const int pad_w, const int dilation_h,
                              const int dilation_w, const int group,
                              const int group_channels,
                              const float offset_scale, const int im2col_step) {
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
        1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
        1;
    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(
        channels == (group * group_channels),
        "Input channels and group times group channels won't match: (%d vs %d).",
        channels, group * group_channels);

    auto output = at::zeros(
        {batch, height_out, width_out, group * group_channels},
        input.options());

    const int batch_n = im2col_step_;
    auto output_n = output.view({batch / batch_n, batch_n, height_out,
                                 width_out, group * group_channels});
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size =
        height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto columns = output_n.select(0, n);
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_forward_cuda", ([&] {
                dcnv3_im2col_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                    offset.data<scalar_t>() +
                        n * im2col_step_ * per_offset_size,
                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
                    columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
                    stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                    group_channels, batch_n, height_in, width_in, height_out,
                    width_out, offset_scale);
            }));
    }

    return output;
}

std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                    const at::Tensor &mask, const int kernel_h,
                    const int kernel_w, const int stride_h, const int stride_w,
                    const int pad_h, const int pad_w, const int dilation_h,
                    const int dilation_w, const int group,
                    const int group_channels, const float offset_scale,
                    const at::Tensor &grad_output, const int im2col_step) {
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(grad_output.is_contiguous(),
               "grad_output tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
    AT_ASSERTM(grad_output.type().is_cuda(),
               "grad_output must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
        1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
        1;
    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(
        channels == (group * group_channels),
        "Input channels and group times group channels won't match: (%d vs %d).",
        channels, group * group_channels);

    auto dtype = input.dtype();
    if (dtype == at::kHalf) {
        dtype = at::kFloat;
    }

    auto grad_input = at::zeros_like(input, dtype);
    auto grad_offset = at::zeros_like(offset, dtype);
    auto grad_mask = at::zeros_like(mask, dtype);

    const int batch_n = im2col_step_;
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size =
        height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    auto grad_output_n =
        grad_output.view({batch / im2col_step_, batch_n,
                          height_out * width_out, group, group_channels});

    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto grad_output_g = grad_output_n.select(0, n);
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_backward_cuda", ([&] {
                dcnv3_col2im_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    grad_output_g.data<scalar_t>(),
                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                    offset.data<scalar_t>() +
                        n * im2col_step_ * per_offset_size,
                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, batch_n,
                    height_in, width_in, height_out, width_out, offset_scale,
                    grad_input.data<opmath_t>() +
                        n * im2col_step_ * per_input_size,
                    grad_offset.data<opmath_t>() +
                        n * im2col_step_ * per_offset_size,
                    grad_mask.data<opmath_t>() +
                        n * im2col_step_ * per_mask_size);
            }));
    }

    if (input.dtype() == torch::kHalf) {
        return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
                grad_mask.to(torch::kHalf)};
    } else {
        return {grad_input, grad_offset, grad_mask};
    }
}
\ No newline at end of file
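Both entry points derive the output resolution with the standard convolution size formula before launching the kernels; restated in Python for reference (illustrative only):

# Sketch of the height_out/width_out arithmetic in dcnv3_cuda_forward/backward.
def conv_out_size(size_in, pad, dilation, kernel, stride):
    return (size_in + 2 * pad - (dilation * (kernel - 1) + 1)) // stride + 1

assert conv_out_size(32, pad=1, dilation=1, kernel=3, stride=1) == 32
assert conv_out_size(32, pad=1, dilation=1, kernel=3, stride=2) == 16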
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.h
0 → 100644
View file @ b64d9ca3
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
                              const at::Tensor &mask, const int kernel_h,
                              const int kernel_w, const int stride_h,
                              const int stride_w, const int pad_h,
                              const int pad_w, const int dilation_h,
                              const int dilation_w, const int group,
                              const int group_channels,
                              const float offset_scale, const int im2col_step);

std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                    const at::Tensor &mask, const int kernel_h,
                    const int kernel_w, const int stride_h, const int stride_w,
                    const int pad_h, const int pad_w, const int dilation_h,
                    const int dilation_w, const int group,
                    const int group_channels, const float offset_scale,
                    const at::Tensor &grad_output, const int im2col_step);
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
0 → 100644
View file @ b64d9ca3
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <ATen/ATen.h>
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 256;
inline int GET_BLOCKS(const int N, const int num_threads) {
    return (N + num_threads - 1) / num_threads;
}
#define opmath_t at::opmath_type<scalar_t>
template <typename scalar_t>
__device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
                                          const int &height, const int &width,
                                          const int &group,
                                          const int &group_channels,
                                          const opmath_t &h, const opmath_t &w,
                                          const int &g, const int &c) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = group * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = g * group_channels + c;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }
    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}
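dcnv3_im2col_bilinear is zero-padded bilinear interpolation: each of the four neighbours of (h, w) contributes only when it lies inside the feature map, weighted by the usual corner products. The same logic on a single-channel NumPy array, as an illustrative sketch:

# Sketch of the zero-padded bilinear sample for one channel of an (H, W) map.
import math
import numpy as np

def bilinear_sample(im, h, w):
    H, W = im.shape
    h0, w0 = math.floor(h), math.floor(w)
    lh, lw = h - h0, w - w0
    def at(y, x):  # out-of-bounds neighbours contribute zero
        return float(im[y, x]) if 0 <= y < H and 0 <= x < W else 0.0
    return ((1 - lh) * (1 - lw) * at(h0, w0) + (1 - lh) * lw * at(h0, w0 + 1) +
            lh * (1 - lw) * at(h0 + 1, w0) + lh * lw * at(h0 + 1, w0 + 1))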
template <typename scalar_t>
__device__ void
dcnv3_col2im_bilinear(const scalar_t *&bottom_data, const int &height,
                      const int &width, const int &nheads,
                      const int &group_channels, const opmath_t &h,
                      const opmath_t &w, const int &m, const int &c,
                      const opmath_t offset_scale, const opmath_t &top_grad,
                      const opmath_t &mask, opmath_t *&grad_im,
                      opmath_t *grad_offset, opmath_t *grad_mask) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * group_channels + c;

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const opmath_t top_grad_im = top_grad * mask;
    opmath_t grad_h_weight = 0, grad_w_weight = 0;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
    }

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    *grad_mask = top_grad * val;
    *grad_offset = offset_scale * grad_w_weight * top_grad_im;
    *(grad_offset + 1) = offset_scale * grad_h_weight * top_grad_im;
}
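For reference, the grad_w_weight/grad_h_weight accumulators are simply the partial derivatives of the sampled value with respect to the fractional location, and the leading offset_scale factor comes from the chain rule through loc = p0 + (k * dilation + offset) * offset_scale. Written out, with l_h, l_w the fractional parts and v_1..v_4 the corner values:

\frac{\partial\,\mathrm{val}}{\partial l_w} = (1 - l_h)(v_2 - v_1) + l_h(v_4 - v_3),
\qquad
\frac{\partial\,\mathrm{val}}{\partial l_h} = (1 - l_w)(v_3 - v_1) + l_w(v_4 - v_2)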
template <typename scalar_t>
__device__ void
dcnv3_col2im_bilinear_gm(const scalar_t *&bottom_data, const int &height,
                         const int &width, const int &nheads,
                         const int &group_channels, const opmath_t &h,
                         const opmath_t &w, const int &m, const int &c,
                         const opmath_t offset_scale, const opmath_t &top_grad,
                         const opmath_t &mask, opmath_t *&grad_im,
                         opmath_t *grad_offset, opmath_t *grad_mask) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * group_channels + c;

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const opmath_t top_grad_im = top_grad * mask;
    opmath_t grad_h_weight = 0, grad_w_weight = 0;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
    }

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    atomicAdd(grad_mask, top_grad * val);
    atomicAdd(grad_offset, offset_scale * grad_w_weight * top_grad_im);
    atomicAdd(grad_offset + 1, offset_scale * grad_h_weight * top_grad_im);
}
template <typename scalar_t>
__global__ void dcnv3_im2col_gpu_kernel(
    const int num_kernels, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask,
    scalar_t *data_col, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const int input_size = height_in * width_in;
        scalar_t *data_col_ptr = data_col + index;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int qid_stride = group * group_channels;
        opmath_t col = 0;
        const scalar_t *data_im_ptr = data_im + b_col * input_size * qid_stride;
        // top-left
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    col += dcnv3_im2col_bilinear(data_im_ptr, height_in,
                                                 width_in, group,
                                                 group_channels, loc_h, loc_w,
                                                 g_col, c_col) *
                           weight;
                }
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
            }
        }
        *data_col_ptr = col;
    }
}
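Each thread of the forward kernel owns one (batch, output pixel, group, group channel) tuple; the modulo/divide chain at the top unflattens the linear index, and p0 is the kernel centre in input coordinates. The decomposition, restated in Python (illustrative only):

# Sketch of the index decomposition mirroring the `_temp % ...; _temp /= ...` chain.
def unflatten_index(index, group_channels, group, width_out, height_out):
    c_col = index % group_channels; index //= group_channels
    g_col = index % group;          index //= group
    x_out = index % width_out;      index //= width_out
    y_out = index % height_out;     index //= height_out
    b_col = index
    return b_col, y_out, x_out, g_col, c_col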
// debug
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h,
    const int dilation_w, const int group, const int group_channels,
    const int height_in, const int width_in, const int height_out,
    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        __shared__ opmath_t cache_grad_offset[blockSize * 2];
        __shared__ opmath_t cache_grad_mask[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();
                if (tid == 0) {
                    opmath_t _grad_w = cache_grad_offset[0],
                             _grad_h = cache_grad_offset[1],
                             _grad_a = cache_grad_mask[0];
                    int sid = 2;
                    for (unsigned int tid = 1; tid < blockSize; ++tid) {
                        _grad_w += cache_grad_offset[sid];
                        _grad_h += cache_grad_offset[sid + 1];
                        _grad_a += cache_grad_mask[tid];
                        sid += 2;
                    }

                    *grad_offset = _grad_w;
                    *(grad_offset + 1) = _grad_h;
                    *grad_mask = _grad_a;
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h,
    const int dilation_w, const int group, const int group_channels,
    const int height_in, const int width_in, const int height_out,
    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        __shared__ opmath_t cache_grad_offset[blockSize * 2];
        __shared__ opmath_t cache_grad_mask[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    *grad_offset = cache_grad_offset[0];
                    *(grad_offset + 1) = cache_grad_offset[1];
                    *grad_mask = cache_grad_mask[0];
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
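Where v1 lets thread 0 walk the whole shared-memory cache serially, v2 folds the cache in log2(blockSize) rounds: each round adds the upper half into the lower half, with a __syncthreads() between rounds. The same pattern on a plain list, as an illustrative sketch:

# Sketch of the power-of-two tree reduction used by the *_reduce_v2 kernels.
def tree_reduce(cache):
    s = len(cache) // 2          # len(cache) must be a power of two (blockSize)
    while s > 0:
        for tid in range(s):     # these iterations run in parallel on the GPU
            cache[tid] += cache[tid + s]
        s //= 2
    return cache[0]

assert tree_reduce([1.0] * 8) == 8.0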
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v1(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h,
    const int dilation_w, const int group, const int group_channels,
    const int height_in, const int width_in, const int height_out,
    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();
                if (tid == 0) {
                    opmath_t _grad_w = cache_grad_offset[0],
                             _grad_h = cache_grad_offset[1],
                             _grad_a = cache_grad_mask[0];
                    int sid = 2;
                    for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
                        _grad_w += cache_grad_offset[sid];
                        _grad_h += cache_grad_offset[sid + 1];
                        _grad_a += cache_grad_mask[tid];
                        sid += 2;
                    }

                    *grad_offset = _grad_w;
                    *(grad_offset + 1) = _grad_h;
                    *grad_mask = _grad_a;
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h,
    const int dilation_w, const int group, const int group_channels,
    const int height_in, const int width_in, const int height_out,
    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
                     s >>= 1, spre >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                        if (tid + (s << 1) < spre) {
                            cache_grad_mask[tid] +=
                                cache_grad_mask[tid + (s << 1)];
                            cache_grad_offset[xid1] +=
                                cache_grad_offset[xid2 + (s << 1)];
                            cache_grad_offset[xid1 + 1] +=
                                cache_grad_offset[xid2 + 1 + (s << 1)];
                        }
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    *grad_offset = cache_grad_offset[0];
                    *(grad_offset + 1) = cache_grad_offset[1];
                    *grad_mask = cache_grad_mask[0];
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h,
    const int dilation_w, const int group, const int group_channels,
    const int height_in, const int width_in, const int height_out,
    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
                     s >>= 1, spre >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                        if (tid + (s << 1) < spre) {
                            cache_grad_mask[tid] +=
                                cache_grad_mask[tid + (s << 1)];
                            cache_grad_offset[xid1] +=
                                cache_grad_offset[xid2 + (s << 1)];
                            cache_grad_offset[xid1 + 1] +=
                                cache_grad_offset[xid2 + 1 + (s << 1)];
                        }
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    atomicAdd(grad_offset, cache_grad_offset[0]);
                    atomicAdd(grad_offset + 1, cache_grad_offset[1]);
                    atomicAdd(grad_mask, cache_grad_mask[0]);
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_gm(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h,
    const int dilation_w, const int group, const int group_channels,
    const int height_in, const int width_in, const int height_out,
    const int width_out, const opmath_t offset_scale, opmath_t *grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear_gm(
                        data_im_ptr, height_in, width_in, group,
                        group_channels, loc_h, loc_w, g_col, c_col,
                        offset_scale, top_grad, weight, grad_im_ptr,
                        grad_offset, grad_mask);
                }
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im,
                       const scalar_t *data_offset, const scalar_t *data_mask,
                       scalar_t *data_col, const int kernel_h,
                       const int kernel_w, const int stride_h,
                       const int stride_w, const int pad_h, const int pad_w,
                       const int dilation_h, const int dilation_w,
                       const int group, const int group_channels,
                       const int batch_n, const int height_in,
                       const int width_in, const int height_out,
                       const int width_out, const opmath_t offset_scale) {
    const int num_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_actual_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_threads = CUDA_NUM_THREADS;
    dcnv3_im2col_gpu_kernel<scalar_t>
        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
           stream>>>(num_kernels, data_im, data_offset, data_mask, data_col,
                     kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                     dilation_h, dilation_w, group, group_channels, height_in,
                     width_in, height_out, width_out, offset_scale);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("error in dcnv3_im2col_cuda: %s\n", cudaGetErrorString(err));
    }
}
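The launcher assigns one thread per output element: ceil(N / CUDA_NUM_THREADS) blocks of 256 threads, with CUDA_KERNEL_LOOP grid-striding as a safety net. GET_BLOCKS is a ceiling division; restated in Python (illustrative only):

# Sketch of the GET_BLOCKS launch arithmetic.
def get_blocks(n, num_threads=256):
    return (n + num_threads - 1) // num_threads

assert get_blocks(256, 256) == 1
assert get_blocks(257, 256) == 2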
template <typename scalar_t>
void dcnv3_col2im_cuda(
    cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w,
    const int pad_h, const int pad_w, const int dilation_h,
    const int dilation_w, const int group, const int group_channels,
    const int batch_n, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale,
    opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) {
    // Block size follows the per-group channel count, capped at
    // CUDA_NUM_THREADS.
    const int num_threads = (group_channels > CUDA_NUM_THREADS)
                                ? CUDA_NUM_THREADS
                                : group_channels;
    const int num_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_actual_kernels =
        batch_n * height_out * width_out * group * group_channels;
    if (group_channels > 1024) {
        if ((group_channels & 1023) == 0) {
            // group_channels is a multiple of 1024: shared-memory reduction
            // split across blocks, with three opmath_t accumulators of
            // dynamic shared memory per thread.
            dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   num_threads * 3 * sizeof(opmath_t), stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
        } else {
            // Irregular large channel counts fall back to the global-memory
            // kernel.
            dcnv3_col2im_gpu_kernel_gm<scalar_t>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
        }
    } else {
        // Power-of-two channel counts up to 1024 get a kernel specialized on
        // the block size at compile time: the _v1 variant for 1-32 channels,
        // the _v2 variant for 64-1024 channels.
        switch (group_channels) {
        case 1:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 2:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 4:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 8:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 16:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 32:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 64:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 128:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 256:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 512:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 1024:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,
                                                                  1024>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        default:
            // Non-power-of-two channel counts below 1024 use the generic
            // shared-memory reduction kernels.
            if (group_channels < 64) {
                dcnv3_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
                    <<<GET_BLOCKS(num_actual_kernels, num_threads),
                       num_threads, num_threads * 3 * sizeof(opmath_t),
                       stream>>>(num_kernels, grad_col, data_im, data_offset,
                                 data_mask, kernel_h, kernel_w, stride_h,
                                 stride_w, pad_h, pad_w, dilation_h,
                                 dilation_w, group, group_channels, height_in,
                                 width_in, height_out, width_out, offset_scale,
                                 grad_im, grad_offset, grad_mask);
            } else {
                dcnv3_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
                    <<<GET_BLOCKS(num_actual_kernels, num_threads),
                       num_threads, num_threads * 3 * sizeof(opmath_t),
                       stream>>>(num_kernels, grad_col, data_im, data_offset,
                                 data_mask, kernel_h, kernel_w, stride_h,
                                 stride_w, pad_h, pad_w, dilation_h,
                                 dilation_w, group, group_channels, height_in,
                                 width_in, height_out, width_out, offset_scale,
                                 grad_im, grad_offset, grad_mask);
            }
        }
    }
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
    }
}
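Both launchers size their grids with GET_BLOCKS, whose definition lives in a common header that this diff does not show. A minimal sketch, assuming the conventional ceiling-division form used by comparable deformable-convolution extensions:

// Hedged sketch of GET_BLOCKS as conventionally defined (an assumption; the
// real definition is outside this diff): one block per num_threads-sized
// chunk of the N logical threads, rounded up.
inline int GET_BLOCKS(const int N, const int num_threads) {
    return (N + num_threads - 1) / num_threads;
}

Under that reading, the third launch argument num_threads * 3 * sizeof(opmath_t) reserves three opmath_t accumulators of dynamic shared memory per thread for the in-block reductions performed by the shm_reduce kernel variants.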
autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/dcnv3.h
0 → 100644
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
* https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/dcnv3_cpu.h"
#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif
at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
                         const at::Tensor &mask, const int kernel_h,
                         const int kernel_w, const int stride_h,
                         const int stride_w, const int pad_h, const int pad_w,
                         const int dilation_h, const int dilation_w,
                         const int group, const int group_channels,
                         const float offset_scale, const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
                                  stride_h, stride_w, pad_h, pad_w, dilation_h,
                                  dilation_w, group, group_channels,
                                  offset_scale, im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
               const at::Tensor &mask, const int kernel_h, const int kernel_w,
               const int stride_h, const int stride_w, const int pad_h,
               const int pad_w, const int dilation_h, const int dilation_w,
               const int group, const int group_channels,
               const float offset_scale, const at::Tensor &grad_output,
               const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
                                   stride_h, stride_w, pad_h, pad_w,
                                   dilation_h, dilation_w, group,
                                   group_channels, offset_scale, grad_output,
                                   im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}
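dcnv3_forward and dcnv3_backward are the symbols ultimately exposed to Python; in extensions laid out like this one, that happens through a small pybind11 translation unit. A minimal sketch, assuming a conventional vision.cpp (the file name and docstrings are illustrative; only the two function names come from the header above):

// Hedged sketch of the usual pybind11 registration for the two dispatchers
// declared in dcnv3.h; treat everything except the function names as an
// assumption.
#include <torch/extension.h>
#include "dcnv3.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward");
    m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward");
}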