OpenDAS / mmdetection3d

Commit ee5667c1
Authored Apr 27, 2020 by zhangwenwei
Parent: 2bb43004

    Reformat cpp code to suppress warning
Showing 15 changed files with 884 additions and 815 deletions.
Changed files:

  mmdet3d/datasets/kitti_dataset.py                         +2    -2
  mmdet3d/models/anchor_heads/second_head.py                +22   -2
  mmdet3d/ops/iou3d/src/iou3d.cpp                           +145  -127
  mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp   +56   -54
  mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu   +102  -90
  mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp       +123  -100
  mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h      +34   -27
  mmdet3d/ops/spconv/include/spconv/pool_ops.h              +9    -8
  mmdet3d/ops/spconv/include/spconv/spconv_ops.h            +162  -131
  mmdet3d/ops/spconv/include/torch_utils.h                  +34   -33
  mmdet3d/ops/voxel/src/scatter_points_cpu.cpp              +101  -110
  mmdet3d/ops/voxel/src/scatter_points_cuda.cu              +6    -6
  mmdet3d/ops/voxel/src/voxelization.h                      +4    -4
  mmdet3d/ops/voxel/src/voxelization_cpu.cpp                +78   -115
  mmdet3d/ops/voxel/src/voxelization_cuda.cu                +6    -6
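The C++ changes below follow two recurring patterns: the deprecated AT_CHECK macro is replaced by TORCH_CHECK, and the deprecated at::Tensor::data<T>() accessor is replaced by data_ptr<T>(), alongside a clang-format style reflow. As orientation only, a minimal sketch of the before/after calls (the helper function and tensor name are illustrative, not from the repository):

    #include <torch/extension.h>

    // Illustrative helper, not part of mmdetection3d: shows the old vs. new
    // PyTorch extension calls this commit migrates between.
    const float *raw_pointer(at::Tensor t) {
      // Old style (triggers deprecation warnings on recent PyTorch):
      //   AT_CHECK(t.type().is_cuda(), "t", " must be a CUDAtensor ");
      //   return t.data<float>();
      // New style used throughout this commit:
      TORCH_CHECK(t.device().is_cuda(), "t", " must be a CUDAtensor ");
      TORCH_CHECK(t.is_contiguous(), "t", " must be contiguous ");
      return t.data_ptr<float>();
    }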
mmdet3d/datasets/kitti_dataset.py

@@ -274,7 +274,7 @@ class KittiDataset(torch_data.Dataset):
                                     out)
         return result_files

-    def evaluate(self, result_files, eval_types=None):
+    def evaluate(self, result_files, logger=None, eval_types=None):
         from mmdet3d.core.evaluation import kitti_eval
         gt_annos = [info['annos'] for info in self.kitti_infos]
         if eval_types == 'img_bbox':
@@ -283,7 +283,7 @@ class KittiDataset(torch_data.Dataset):
         else:
             ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
                                                 self.class_names)
-        return ap_result_str, ap_dict
+        return ap_dict

     def bbox2result_kitti(self, net_outputs, class_names, out=None):
         if out:
mmdet3d/models/anchor_heads/second_head.py

@@ -15,12 +15,32 @@ from .train_mixins import AnchorTrainMixin
 @HEADS.register_module
 class SECONDHead(nn.Module, AnchorTrainMixin):
-    """Anchor-based head (RPN, RetinaNet, SSD, etc.).
+    """Anchor-based head for VoxelNet detectors.

     Args:
         class_name (list[str]): name of classes (TODO: to be removed)
         in_channels (int): Number of channels in the input feature map.
         train_cfg (dict): train configs
         test_cfg (dict): test configs
         feat_channels (int): Number of channels of the feature map.
         use_direction_classifier (bool): Whether to add a direction classifier.
         encode_bg_as_zeros (bool): Whether to use sigmoid of softmax
             (TODO: to be removed)
         box_code_size (int): The size of box code.
         anchor_generator (dict): Config dict of anchor generator.
         assigner_per_size (bool): Whether to do assignment for each separate
             anchor size.
         assign_per_class (bool): Whether to do assignment for each class.
         diff_rad_by_sin (bool): Whether to change the difference into sin
             difference for box regression loss.
         dir_offset (float | int): The offset of BEV rotation angles
             (TODO: may be moved into box coder)
         dirlimit_offset (float | int): The limited range of BEV rotation angles
             (TODO: may be moved into box coder)
         box_coder (dict): Config dict of box coders.
         loss_cls (dict): Config of classification loss.
         loss_bbox (dict): Config of localization loss.
         loss_dir (dict): Config of direction classifier loss.
     """  # noqa: W605

     def __init__(self,
@@ -253,7 +273,7 @@ class SECONDHead(nn.Module, AnchorTrainMixin):
         num_levels = len(cls_scores)
         featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
         device = cls_scores[0].device
-        mlvl_anchors = self.anchor_generators.grid_anchors(
+        mlvl_anchors = self.anchor_generator.grid_anchors(
             featmap_sizes, device=device)
         mlvl_anchors = [
             anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors
mmdet3d/ops/iou3d/src/iou3d.cpp

-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
-#include <vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+
+#include <vector>

-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
-#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a,
                         const int num_b, const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);

int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_overlap);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

-  const float *boxes_a_data = boxes_a.data<float>();
-  const float *boxes_b_data = boxes_b.data<float>();
-  float *ans_overlap_data = ans_overlap.data<float>();
+  const float *boxes_a_data = boxes_a.data_ptr<float>();
+  const float *boxes_b_data = boxes_b.data_ptr<float>();
+  float *ans_overlap_data = ans_overlap.data_ptr<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}

int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_iou);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

-  const float *boxes_a_data = boxes_a.data<float>();
-  const float *boxes_b_data = boxes_b.data<float>();
-  float *ans_iou_data = ans_iou.data<float>();
+  const float *boxes_a_data = boxes_a.data_ptr<float>();
+  const float *boxes_b_data = boxes_b.data_ptr<float>();
+  float *ans_iou_data = ans_iou.data_ptr<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}

int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
-  const float *boxes_data = boxes.data<float>();
-  long *keep_data = keep.data<long>();
+  const float *boxes_data = boxes.data_ptr<float>();
+  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long remv_cpu[col_blocks];
  memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
-  const float *boxes_data = boxes.data<float>();
-  long *keep_data = keep.data<long>();
+  const float *boxes_data = boxes.data_ptr<float>();
+  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long remv_cpu[col_blocks];
  memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
...
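The host-side loop in nms_gpu and nms_normal_gpu walks a suppression mask with 64 bits per word: box i is kept only if no earlier kept box has set its bit, and keeping a box ORs that box's mask row into remv_cpu. A small self-contained sketch of that bookkeeping on synthetic data (the pairwise-overlap mask is faked with a plain array here; in the real code the rows come from the CUDA launcher):

    #include <cstdio>
    #include <vector>

    int main() {
      const int kThreads = 64;   // bits per mask word, as in THREADS_PER_BLOCK_NMS
      const int boxes_num = 5;
      const int col_blocks = (boxes_num + kThreads - 1) / kThreads;  // DIVUP

      // Fake mask: row i has bit j set if box i suppresses box j
      // (the real kernel fills this on the GPU).
      std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks, 0ULL);
      mask_cpu[0 * col_blocks] = (1ULL << 1) | (1ULL << 3);  // box 0 suppresses 1 and 3

      std::vector<unsigned long long> remv_cpu(col_blocks, 0ULL);
      std::vector<long> keep(boxes_num);
      int num_to_keep = 0;

      for (int i = 0; i < boxes_num; i++) {
        int nblock = i / kThreads;
        int inblock = i % kThreads;
        if (!(remv_cpu[nblock] & (1ULL << inblock))) {  // box i not yet suppressed
          keep[num_to_keep++] = i;
          const unsigned long long *p = &mask_cpu[0] + i * col_blocks;
          for (int j = nblock; j < col_blocks; j++) remv_cpu[j] |= p[j];
        }
      }
      printf("kept %d boxes\n", num_to_keep);  // expected: 3 (boxes 0, 2, 4)
      return 0;
    }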
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes cpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>

-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")

// #define DEBUG

inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}

inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor) {
  // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the
  // bottom center, each box DO NOT overlaps
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_indices: (N, npoints)

  CHECK_CONTIGUOUS(boxes_tensor);
  CHECK_CONTIGUOUS(pts_tensor);
  CHECK_CONTIGUOUS(pts_indices_tensor);

  int boxes_num = boxes_tensor.size(0);
  int pts_num = pts_tensor.size(0);

-  const float *boxes = boxes_tensor.data<float>();
-  const float *pts = pts_tensor.data<float>();
-  int *pts_indices = pts_indices_tensor.data<int>();
+  const float *boxes = boxes_tensor.data_ptr<float>();
+  const float *pts = pts_tensor.data_ptr<float>();
+  int *pts_indices = pts_indices_tensor.data_ptr<int>();

  float local_x = 0, local_y = 0;
  for (int i = 0; i < boxes_num; i++) {
    for (int j = 0; j < pts_num; j++) {
      int cur_in_flag =
          check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
      pts_indices[i * pts_num + j] = cur_in_flag;
    }
  }

  return 1;
}
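check_pt_in_box3d_cpu shifts the point into the box frame, rotates the offset by rz + pi/2, and then does an axis-aligned extent test against l/2 and w/2, after moving cz from the bottom face to the box center. A standalone sketch of the same math, with a hypothetical box and point purely to illustrate the rotation step:

    #include <cmath>
    #include <cstdio>

    int main() {
      // Hypothetical box (cx, cy, cz, w, l, h, rz), cz at the bottom face,
      // and a point expressed in the same LiDAR frame.
      float box[7] = {10.f, 5.f, 0.f, 2.f, 4.f, 1.5f, float(M_PI) / 4.f};
      float pt[3] = {10.5f, 5.5f, 0.7f};

      float cz = box[2] + box[5] / 2.0f;  // bottom center -> geometric center
      bool in_z = std::fabs(pt[2] - cz) <= box[5] / 2.0f;

      // Rotate the offset by rz + pi/2 to go from LiDAR axes to box-local axes.
      float rot = box[6] + float(M_PI) / 2.0f;
      float dx = pt[0] - box[0], dy = pt[1] - box[1];
      float local_x = dx * std::cos(rot) - dy * std::sin(rot);
      float local_y = dx * std::sin(rot) + dy * std::cos(rot);

      bool in_xy = std::fabs(local_x) < box[4] / 2.0f &&  // length l along local x
                   std::fabs(local_y) < box[3] / 2.0f;    // width  w along local y
      printf("inside: %d\n", in_z && in_xy);
      return 0;
    }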
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes gpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>

#define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
-#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
// #define DEBUG

__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}

__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center
  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}

__global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
                                       int pts_num, const float *boxes,
                                       const float *pts,
                                       int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1

  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= batch_size || pt_idx >= pts_num) return;

  boxes += bs_idx * boxes_num * 7;
  pts += bs_idx * pts_num * 3 + pt_idx * 3;
  box_idx_of_points += bs_idx * pts_num + pt_idx;

  float local_x = 0, local_y = 0;
  int cur_in_flag = 0;
  for (int k = 0; k < boxes_num; k++) {
    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
    if (cur_in_flag) {
      box_idx_of_points[0] = k;
      break;
    }
  }
}

void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
                              const float *boxes, const float *pts,
                              int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1

  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
  points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
                                              boxes, pts, box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }

#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1

  CHECK_INPUT(boxes_tensor);
  CHECK_INPUT(pts_tensor);
  CHECK_INPUT(box_idx_of_points_tensor);

  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);

-  const float *boxes = boxes_tensor.data<float>();
-  const float *pts = pts_tensor.data<float>();
-  int *box_idx_of_points = box_idx_of_points_tensor.data<int>();
+  const float *boxes = boxes_tensor.data_ptr<float>();
+  const float *pts = pts_tensor.data_ptr<float>();
+  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

  points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts,
                           box_idx_of_points);

  return 1;
}
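points_in_boxes_launcher sizes the grid with DIVUP so that every point in every batch element gets one thread: blockIdx.x covers ceil(pts_num / THREADS_PER_BLOCK) chunks of points and blockIdx.y covers the batch dimension. A small sketch of that arithmetic on the host (the point count is made up):

    #include <cstdio>

    // Same rounding-up division used by the kernels: DIVUP(m, n)
    static int divup(int m, int n) { return m / n + (m % n > 0); }

    int main() {
      const int threads_per_block = 256;  // THREADS_PER_BLOCK in the .cu file
      int batch_size = 4;
      int pts_num = 20000;                // hypothetical points per sample

      int grid_x = divup(pts_num, threads_per_block);  // blocks along the point axis
      int grid_y = batch_size;                         // one block row per sample

      // Launch shape would be <<<dim3(grid_x, grid_y), threads_per_block>>>;
      // threads with pt_idx >= pts_num simply return, as in the kernel.
      printf("grid = (%d, %d), threads = %d, covered points = %d\n",
             grid_x, grid_y, threads_per_block, grid_x * threads_per_block);
      return 0;
    }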
mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// RoI-aware point cloud feature pooling
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
#include <assert.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>

-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
-#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)

void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              int max_pts_each_voxel, int out_x, int out_y,
                              int out_z, const float *rois, const float *pts,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method);

void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
                                       int out_z, int channels,
                                       int max_pts_each_voxel,
                                       const int *pts_idx_of_voxels,
                                       const int *argmax, const float *grad_out,
                                       float *grad_in, int pool_method);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method);

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method);

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor);

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params pool_method: 0: max_pool 1: avg_pool
  CHECK_INPUT(rois);
  CHECK_INPUT(pts);
  CHECK_INPUT(pts_feature);
  CHECK_INPUT(argmax);
  CHECK_INPUT(pts_idx_of_voxels);
  CHECK_INPUT(pooled_features);

  int boxes_num = rois.size(0);
  int pts_num = pts.size(0);
  int channels = pts_feature.size(1);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  assert((out_x < 256) && (out_y < 256) &&
         (out_z < 256));  // we encode index with 8bit

-  const float *rois_data = rois.data<float>();
-  const float *pts_data = pts.data<float>();
-  const float *pts_feature_data = pts_feature.data<float>();
-  int *argmax_data = argmax.data<int>();
-  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data<int>();
-  float *pooled_features_data = pooled_features.data<float>();
+  const float *rois_data = rois.data_ptr<float>();
+  const float *pts_data = pts.data_ptr<float>();
+  const float *pts_feature_data = pts_feature.data_ptr<float>();
+  int *argmax_data = argmax.data_ptr<int>();
+  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
+  float *pooled_features_data = pooled_features.data_ptr<float>();

  roiaware_pool3d_launcher(boxes_num, pts_num, channels, max_pts_each_voxel,
                           out_x, out_y, out_z, rois_data, pts_data,
                           pts_feature_data, argmax_data,
                           pts_idx_of_voxels_data, pooled_features_data,
                           pool_method);

  return 1;
}

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // params pool_method: 0: max_pool 1: avg_pool
  CHECK_INPUT(pts_idx_of_voxels);
  CHECK_INPUT(argmax);
  CHECK_INPUT(grad_out);
  CHECK_INPUT(grad_in);

  int boxes_num = pts_idx_of_voxels.size(0);
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int channels = grad_out.size(4);

-  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data<int>();
-  const int *argmax_data = argmax.data<int>();
-  const float *grad_out_data = grad_out.data<float>();
-  float *grad_in_data = grad_in.data<float>();
+  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
+  const int *argmax_data = argmax.data_ptr<int>();
+  const float *grad_out_data = grad_out.data_ptr<float>();
+  float *grad_in_data = grad_in.data_ptr<float>();

  roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
                                    max_pts_each_voxel, pts_idx_of_voxels_data,
                                    argmax_data, grad_out_data, grad_in_data,
                                    pool_method);

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
  m.def("backward", &roiaware_pool3d_gpu_backward,
        "roiaware pool3d backward (CUDA)");
  m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
        "points_in_boxes_gpu forward (CUDA)");
  m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
        "points_in_boxes_cpu forward (CPU)");
}
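pool_method is passed straight through to the launchers as an integer flag, with the convention noted in the comments: 0 selects max pooling (with argmax recorded for the backward pass) and 1 selects average pooling. A tiny hypothetical CPU stand-in, only to illustrate that convention:

    #include <algorithm>
    #include <cstdio>

    // Hypothetical stand-in for the launcher's pooling switch:
    // pool_method == 0 -> max pooling, pool_method == 1 -> average pooling.
    float pool(const float *vals, int n, int pool_method) {
      if (pool_method == 0) {
        float best = vals[0];
        for (int i = 1; i < n; ++i) best = std::max(best, vals[i]);
        return best;
      }
      float sum = 0.f;
      for (int i = 0; i < n; ++i) sum += vals[i];
      return sum / n;
    }

    int main() {
      float v[4] = {1.f, 3.f, 2.f, 6.f};
      printf("max: %.1f  avg: %.1f\n", pool(v, 4, 0), pool(v, 4, 1));  // 6.0 / 3.0
      return 0;
    }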
mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h

@@ -26,9 +26,10 @@ namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor bias,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
@@ -37,13 +38,16 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
+    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
+                                      indicePairNumCpu.data_ptr<int>() + kernelVolume);
    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
    auto indicePairVecMaxSizeIter = std::max_element(
@@ -56,46 +60,49 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  // auto indicePairOptions =
  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());

  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {  // the center index of subm conv don't need gather and scatter
               // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer.data<T>(),
-                                            {nHot, numInPlanes}, options);
+    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
+                                             {nHot, numOutPlanes}, options);
+    auto inputBufferBlob =
+        torch::from_blob(inputBuffer.data_ptr<T>(), {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot},
-                                             indicePairOptions);
-      torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob);*/
+      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
+                                             {nHot}, indicePairOptions);
+      torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob);*/
    }
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
@@ -105,14 +112,14 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
    // totalSAddTime += timer.report() / 1000.0;
@@ -122,6 +129,6 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
  return output;
}

}  // namespace spconv

#endif
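fusedIndiceConvBatchNorm reuses one preallocated gather buffer across kernel offsets by wrapping its first nHot rows with torch::from_blob, so each GEMM sees a correctly shaped view without a new allocation or copy. A minimal standalone sketch of that pattern with made-up sizes:

    #include <torch/torch.h>
    #include <iostream>

    int main() {
      auto options = torch::TensorOptions().dtype(torch::kFloat32);
      // Preallocated buffer sized for the worst case, as in the spconv code.
      torch::Tensor buffer = torch::zeros({8, 4}, options);

      int nHot = 3;  // only the first nHot rows are valid for this kernel offset
      // View over the same memory; no copy, no allocation.
      auto view = torch::from_blob(buffer.data_ptr<float>(), {nHot, 4}, options);

      view.fill_(1.0f);                        // writes through to `buffer`
      std::cout << buffer.sum().item<float>()  // 12 = nHot * 4
                << std::endl;
      return 0;
    }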
mmdet3d/ops/spconv/include/spconv/pool_ops.h

@@ -24,7 +24,7 @@
namespace spconv {
template <typename T>
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
                            torch::Tensor indiceNum, int64_t numAct) {
  auto device = features.device().type();
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
@@ -34,7 +34,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
  double totalTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
@@ -59,18 +59,19 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum) {
  auto device = features.device().type();
  auto numInPlanes = features.size(1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
@@ -92,6 +93,6 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
  return inputGrad;
}

}  // namespace spconv

#endif
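Both pooling functions move indiceNum to the CPU once and then read the per-kernel-offset counts through data_ptr<int>() inside the loop, skipping offsets with no active pairs. A standalone sketch of that access pattern (the counts are invented):

    #include <torch/torch.h>
    #include <iostream>

    int main() {
      // Stand-in for indiceNum: number of active index pairs per kernel offset.
      torch::Tensor indiceNum =
          torch::tensor({5, 0, 3}, torch::dtype(torch::kInt32));

      // Same pattern as the spconv headers: copy to CPU once, then read raw ints.
      auto indicePairNumCpu = indiceNum.to(torch::kCPU);
      int kernelVolume = indicePairNumCpu.size(0);

      for (int i = 0; i < kernelVolume; ++i) {
        int nHot = indicePairNumCpu.data_ptr<int>()[i];
        if (nHot <= 0) continue;  // nothing to gather for this offset
        std::cout << "offset " << i << ": " << nHot << " pairs\n";
      }
      return 0;
    }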
mmdet3d/ops/spconv/include/spconv/spconv_ops.h
View file @
ee5667c1
...
...
@@ -25,17 +25,17 @@
namespace
spconv
{
// torch.jit's doc says only support int64, so we need to convert to int32.
template
<
unsigned
NDim
>
std
::
vector
<
torch
::
Tensor
>
getIndicePair
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outPadding
,
int64_t
_subM
,
int64_t
_transpose
)
{
std
::
vector
<
torch
::
Tensor
>
getIndicePair
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outPadding
,
int64_t
_subM
,
int64_t
_transpose
)
{
// auto timer = spconv::CudaContextTimer<>();
bool
subM
=
_subM
!=
0
;
bool
transpose
=
_transpose
!=
0
;
auto
numAct
=
indices
.
size
(
0
);
auto
coorDim
=
indices
.
size
(
1
)
-
1
;
// batchIdx + xyz
auto
coorDim
=
indices
.
size
(
1
)
-
1
;
// batchIdx + xyz
TV_ASSERT_RT_ERR
(
NDim
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
kernelSize
.
size
()
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
outSpatialShape
.
size
()
==
coorDim
,
"error"
);
...
...
@@ -54,7 +54,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
}
torch
::
Tensor
indicePairs
=
torch
::
full
({
kernelVolume
,
2
,
numAct
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
torch
::
Tensor
indiceNum
=
torch
::
zeros
(
{
kernelVolume
},
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
torch
::
Tensor
gridOut
=
...
...
@@ -67,9 +67,9 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv
::
SimpleVector
<
int
,
NDim
>
stride32
;
tv
::
SimpleVector
<
int
,
NDim
>
padding32
;
tv
::
SimpleVector
<
int
,
NDim
>
dilation32
;
auto
indicePairUnique
=
torch
::
full
(
{
indicePairs
.
numel
()
/
2
+
1
},
std
::
numeric_limits
<
int
>::
max
(),
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
auto
indicePairUnique
=
torch
::
full
(
{
indicePairs
.
numel
()
/
2
+
1
},
std
::
numeric_limits
<
int
>::
max
(),
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
outSpatialShape32
.
push_back
(
outSpatialShape
[
i
]);
kernelSize32
.
push_back
(
kernelSize
[
i
]);
...
...
@@ -88,40 +88,44 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
auto
getIndicePairFtor
=
functor
::
CreateSubMIndicePairFunctor
<
tv
::
CPU
,
int
,
int
,
NDim
>
();
numActOut
=
getIndicePairFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
tv
::
CPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
}
else
{
auto
getIndicePairFtor
=
functor
::
CreateSubMIndicePairFunctor
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
numActOut
=
getIndicePairFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
}
return
{
indices
,
indicePairs
,
indiceNum
};
}
else
{
torch
::
Tensor
outInds
=
torch
::
zeros
({
numAct
*
kernelVolume
,
coorDim
+
1
},
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
auto
getIndicePairFtor
=
functor
::
CreateConvIndicePairFunctor
<
tv
::
CPU
,
int
,
int
,
NDim
>
();
auto
getIndicePairFtor
=
functor
::
CreateConvIndicePairFunctor
<
tv
::
CPU
,
int
,
int
,
NDim
>
();
numActOut
=
getIndicePairFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
}
else
{
auto
getIndicePairFtorP1
=
functor
::
CreateConvIndicePairFunctorP1
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
auto
getIndicePairFtorP2
=
functor
::
CreateConvIndicePairFunctorP2
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
numActOut
=
getIndicePairFtorP1
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
tv
::
torch2tv
<
int
>
(
indicePairUnique
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
numActOut
=
getIndicePairFtorP1
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
tv
::
torch2tv
<
int
>
(
indicePairUnique
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
if
(
numActOut
>
0
)
{
auto
res
=
torch
::
_unique
(
indicePairUnique
);
indicePairUnique
=
std
::
get
<
0
>
(
res
);
...
...
@@ -137,17 +141,17 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
}
template
<
unsigned
NDim
>
std
::
vector
<
torch
::
Tensor
>
getIndicePairPreGrid
(
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outPadding
,
int64_t
_subM
,
int64_t
_transpose
)
{
std
::
vector
<
torch
::
Tensor
>
getIndicePairPreGrid
(
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outPadding
,
int64_t
_subM
,
int64_t
_transpose
)
{
// auto timer = spconv::CudaContextTimer<>();
bool
subM
=
_subM
!=
0
;
bool
transpose
=
_transpose
!=
0
;
auto
numAct
=
indices
.
size
(
0
);
auto
coorDim
=
indices
.
size
(
1
)
-
1
;
// batchIdx + xyz
auto
coorDim
=
indices
.
size
(
1
)
-
1
;
// batchIdx + xyz
TV_ASSERT_RT_ERR
(
NDim
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
kernelSize
.
size
()
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
outSpatialShape
.
size
()
==
coorDim
,
"error"
);
...
...
@@ -167,7 +171,7 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
...
...
@@ -177,9 +181,9 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
...
...
@@ -198,42 +202,46 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, true);
    }
    return {indices, indicePairs, indiceNum};
  } else {
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose, true);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
...
...
@@ -241,19 +249,18 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
            true);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
...
...
@@ -262,13 +269,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  /*if (_subM){
  std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
                                    indicePairNumCpu.data_ptr<int>() + kernelVolume);
  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
  auto indicePairVecMaxSizeIter = std::max_element(
...
...
@@ -282,45 +292,47 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {  // the center index of subm conv don't need gather and scatter
               // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
                                            {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
      features, 0, indicePairBlob);*/
    }
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...
...
@@ -330,14 +342,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
// totalSAddTime += timer.report() / 1000.0;
...
...
@@ -349,10 +361,12 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
}

template <typename T>
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
...
...
@@ -362,16 +376,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
...
...
@@ -383,7 +400,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
...
...
@@ -392,27 +409,31 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                    nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                    nHot);
      TV_CHECK_CUDA_ERR();
    }
    auto filterGradSub = filtersGrad[i];
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
                                            {nHot, numInPlanes}, options);
    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
...
...
@@ -420,12 +441,14 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                  nHot);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                  nHot);
      TV_CHECK_CUDA_ERR();
    }
  }
...
...
@@ -433,9 +456,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
}

template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
...
...
@@ -446,15 +472,19 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto totalActsTen = indicePairNumCpu.sum();
  auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  std::vector<int> indicePairNumVec(
      indicePairNumCpu.data_ptr<int>(),
      indicePairNumCpu.data_ptr<int>() + kernelVolume);
  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
  int subRuleMaxSize =
      *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
  if (subM) {
    indicePairMaxSize = subRuleMaxSize;
  }
...
...
@@ -470,9 +500,9 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
  torch::Tensor outputBuffer =
      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  std::cout << "create time " << timer.report() / 1000.0 << std::endl;
  if (subM) {  // the center index of subm conv don't need gather and scatter
               // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
...
...
@@ -480,43 +510,44 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
  double totalSAddTime = 0;
  // auto timer = spconv::CudaContextTimer<>();
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    //
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
                                            {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
    }
    // }
    // for (int i = 0; i < kernelVolume; ++i) {
    // totalGatherTime += timer.report() / 1000.0;
    // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
    // {nHot, numOutPlanes}, options);
  }
  // totalGatherTime += timer.report() / 1000.0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
                                            {nHot, numInPlanes}, options);
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...
...
@@ -524,27 +555,27 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
  // totalGEMMTime += timer.report() / 1000.0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
                                            {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
// totalSAddTime += timer.report() / 1000.0;
...
...
@@ -556,6 +587,6 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
  return output;
}

}  // namespace spconv
#endif
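The indiceConv body in this header follows the usual gather-GEMM-scatter pattern of sparse convolution: for each kernel offset it gathers the active input rows named by indicePairs, multiplies them with that offset's filter slice, and scatter-adds the products into the output rows. The block below is a minimal sketch, not part of this commit, that expresses the same loop with plain libtorch calls (index_select / index_add_) instead of the SparseGatherFunctor / SparseScatterAddFunctor used above; the tensor layouts noted in the comments are assumptions made for illustration.

// Simplified sketch of the gather -> GEMM -> scatter-add loop (assumed layouts):
//   features:    [numActIn, Cin]
//   filters:     [kernelVolume, Cin, Cout]
//   indicePairs: [kernelVolume, 2, maxPairs]  (input/output index pairs)
//   indiceNum:   [kernelVolume]               (valid pairs per kernel offset)
#include <torch/torch.h>

torch::Tensor sparse_conv_sketch(const torch::Tensor& features,
                                 const torch::Tensor& filters,
                                 const torch::Tensor& indicePairs,
                                 const torch::Tensor& indiceNum,
                                 int64_t numActOut) {
  auto output = torch::zeros({numActOut, filters.size(2)}, features.options());
  auto indiceNumCpu = indiceNum.to(torch::kCPU);
  for (int64_t i = 0; i < filters.size(0); ++i) {
    int64_t nHot = indiceNumCpu[i].item<int64_t>();
    if (nHot <= 0) continue;
    // gather the input rows touched by this kernel offset
    auto inIdx = indicePairs[i][0].slice(0, 0, nHot).to(torch::kLong);
    auto outIdx = indicePairs[i][1].slice(0, 0, nHot).to(torch::kLong);
    auto gathered = features.index_select(0, inIdx);  // [nHot, Cin]
    auto partial = torch::mm(gathered, filters[i]);   // [nHot, Cout]
    // scatter-add the partial products to their output positions
    output.index_add_(0, outIdx, partial);
  }
  return output;
}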
mmdet3d/ops/spconv/include/torch_utils.h
View file @
ee5667c1
...
...
@@ -13,48 +13,49 @@
// limitations under the License.
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace tv {

struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
};

template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
  switch (tensor.type().scalarType()) {
    case at::ScalarType::Double: {
      auto val = std::is_same<std::remove_const_t<T>, double>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Float: {
      auto val = std::is_same<std::remove_const_t<T>, float>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Int: {
      auto val = std::is_same<std::remove_const_t<T>, int>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Half: {
      auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Long: {
      auto val = std::is_same<std::remove_const_t<T>, long>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    default:
      TV_ASSERT_RT_ERR(false, "error");
  }
}
...
...
@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
  return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
}

}  // namespace tv
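Most of the changes in this commit replace PyTorch C++ APIs that were deprecated in recent releases: Tensor::data&lt;T&gt;() becomes Tensor::data_ptr&lt;T&gt;(), and the properties reached through .type() give way to .scalar_type() and .device(). Below is a minimal, self-contained illustration of that migration; it is not code from this repository.

// Minimal illustration of the accessor migration applied throughout this commit:
//   data<T>()            -> data_ptr<T>()
//   type().scalarType()  -> scalar_type()
#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor t = torch::arange(6, torch::kFloat32).reshape({2, 3});

  // old, deprecated:
  //   float* p = t.data<float>();
  //   auto st  = t.type().scalarType();
  float* p = t.data_ptr<float>();  // raw pointer to the tensor storage
  auto st = t.scalar_type();       // at::ScalarType::Float

  std::cout << p[4] << " " << static_cast<int>(st) << std::endl;
  return 0;
}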
mmdet3d/ops/voxel/src/scatter_points_cpu.cpp
View file @
ee5667c1
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {

template <typename T_int>
void determin_max_points_kernel(
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 1> num_points_per_voxel,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
    int& max_points, const int num_points) {
  int voxelidx, num;
  for (int i = 0; i < num_points; ++i) {
    if (coor[i][0] == -1) continue;
    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    // record voxel
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
    }
    // put points into voxel
    num = num_points_per_voxel[voxelidx];
    point_to_voxelidx[i] = num;
    num_points_per_voxel[voxelidx] += 1;
    // update max points per voxel
    max_points = std::max(max_points, num + 1);
  }
  return;
}
template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
    const torch::TensorAccessor<T, 2> points,
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
    torch::TensorAccessor<T, 3> voxels,
    torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
    const int num_points, const int NDim) {
  for (int i = 0; i < num_points; ++i) {
    int num = point_to_voxelidx[i];
    int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    for (int k = 0; k < num_features; ++k) {
      voxels[voxelidx][num][k] = points[i][k];
    }
    for (int k = 0; k < NDim; ++k) {
      voxel_coors[voxelidx][k] = coor[i][k];
    }
  }
}

}  // namespace
namespace voxelization {

std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
    const at::Tensor& points, const at::Tensor& voxel_mapping,
    const std::vector<float> voxel_size,
    const std::vector<float> coors_range) {
  // current version tooks about 0.02s_0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  const int NDim = voxel_mapping.size(1);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  std::vector<int> grid_size(NDim);
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  at::Tensor num_points_per_voxel =
      at::zeros({num_points}, voxel_mapping.options());
  at::Tensor coor_to_voxelidx = -at::ones(
      {grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
  at::Tensor point_to_voxelidx =
      -at::ones({num_points}, voxel_mapping.options());

  int voxel_num = 0;
  int max_points = 0;
  AT_DISPATCH_ALL_TYPES(
      voxel_mapping.scalar_type(), "determin_max_point", [&] {
        determin_max_points_kernel<scalar_t>(
            voxel_mapping.accessor<scalar_t, 2>(),
            point_to_voxelidx.accessor<scalar_t, 1>(),
            num_points_per_voxel.accessor<scalar_t, 1>(),
            coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
            num_points);
      });

  at::Tensor voxels =
      at::zeros({voxel_num, max_points, num_features}, points.options());
  at::Tensor voxel_coors =
      at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));

  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
    scatter_point_to_voxel_kernel<scalar_t, int>(
        points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
        point_to_voxelidx.accessor<int, 1>(),
        coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
        voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
  });

  at::Tensor num_points_per_voxel_out =
      num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
  return {voxels, voxel_coors, num_points_per_voxel_out};
}

}  // namespace voxelization
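dynamic_point_to_voxel_cpu above is a two-pass scatter: the first kernel assigns every point a slot inside its voxel and tracks the number of voxels and the maximum occupancy, and the second kernel copies point features into the dense [voxel_num, max_points, num_features] buffer. The following is a standalone sketch of the same idea using only standard-library containers; the 4-feature point layout and helper names are assumptions for illustration, not the repository's API.

// Standalone sketch of the two-pass point-to-voxel scatter (assumed layout).
#include <algorithm>
#include <array>
#include <map>
#include <vector>

struct ScatterResult {
  std::vector<float> voxels;               // voxel_num * max_points * C, zero padded
  std::vector<std::array<int, 3>> coors;   // one coordinate per voxel
  std::vector<int> points_per_voxel;
  int max_points = 0;
};

ScatterResult scatter_points(const std::vector<std::array<float, 4>>& points,
                             const std::vector<std::array<int, 3>>& coor) {
  ScatterResult res;
  std::map<std::array<int, 3>, int> coor_to_voxel;
  std::vector<int> slot(points.size(), -1);
  std::vector<int> voxel_of_point(points.size(), -1);

  // pass 1: assign voxel ids, per-point slots, and the max occupancy
  for (size_t i = 0; i < points.size(); ++i) {
    if (coor[i][0] == -1) continue;  // point fell outside the range
    auto it = coor_to_voxel.find(coor[i]);
    int v;
    if (it == coor_to_voxel.end()) {
      v = static_cast<int>(res.coors.size());
      coor_to_voxel[coor[i]] = v;
      res.coors.push_back(coor[i]);
      res.points_per_voxel.push_back(0);
    } else {
      v = it->second;
    }
    voxel_of_point[i] = v;
    slot[i] = res.points_per_voxel[v]++;
    res.max_points = std::max(res.max_points, slot[i] + 1);
  }

  // pass 2: copy features into the dense [voxel_num, max_points, C] buffer
  const int C = 4;
  res.voxels.assign(res.coors.size() * res.max_points * C, 0.f);
  for (size_t i = 0; i < points.size(); ++i) {
    if (slot[i] < 0) continue;
    float* dst =
        &res.voxels[(voxel_of_point[i] * res.max_points + slot[i]) * C];
    for (int k = 0; k < C; ++k) dst[k] = points[i][k];
  }
  return res;
}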
mmdet3d/ops/voxel/src/scatter_points_cuda.cu
View file @
ee5667c1
...
...
@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
...
...
@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
  dim3 threads(threadsPerBlock);
  cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(
      voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
            voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
            point_to_pointidx.data_ptr<int>(), num_points, NDim);
...
...
@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
      voxel_mapping.options());  // must be zero from the begining
  cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(
      voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
        determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
            voxel_mapping.data_ptr<int>(),
            num_points_per_voxel.data_ptr<int>(),
            point_to_voxelidx.data_ptr<int>(),
...
...
@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
  dim3 cp_threads(threadsPerBlock, 4);
  cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "scatter_point_to_voxel", ([&] {
        scatter_point_to_voxel_kernel<float, int>
            <<<blocks, cp_threads, 0, cp_stream>>>(
                points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
...
...
@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
  dim3 blocks(col_blocks);
  dim3 cp_threads(threadsPerBlock, 4);
  cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(
      grad_input_points.scalar_type(), "scatter_point_to_voxel", ([&] {
        map_voxel_to_point_kernel<float, int>
            <<<blocks, cp_threads, 0, cp_stream>>>(
                grad_input_points.data_ptr<float>(),
...
...
mmdet3d/ops/voxel/src/voxelization.h
View file @
ee5667c1
...
...
@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
                          const std::vector<float> coors_range,
                          const int max_points, const int max_voxels,
                          const int NDim = 3) {
  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
                             voxel_size, coors_range, max_points, max_voxels,
...
...
@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
                             const std::vector<float> voxel_size,
                             const std::vector<float> coors_range,
                             const int NDim = 3) {
  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else
...
...
@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
    const at::Tensor& points, const at::Tensor& voxel_mapping,
    const std::vector<float> voxel_size,
    const std::vector<float> coors_range) {
  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping,
                                              voxel_size, coors_range);
...
...
@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
inline void dynamic_point_to_voxel_backward(
    at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
    const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
  if (grad_input_points.device().is_cuda()) {
#ifdef WITH_CUDA
    return dynamic_point_to_voxel_backward_gpu(grad_input_points,
                                               grad_output_voxels,
                                               point_to_voxelidx,
...
...
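The only change in this header is the device check inside the CPU/GPU dispatch wrappers: the deprecated points.type().is_cuda() becomes points.device().is_cuda(). Below is a minimal sketch of that dispatch pattern with the updated check; my_op, my_op_cpu and my_op_gpu are placeholder names, not functions from this repository.

// Sketch of the CPU/GPU dispatcher with the updated device check (placeholder names).
#include <torch/extension.h>

void my_op_cpu(const at::Tensor& points);
void my_op_gpu(const at::Tensor& points);

inline void my_op(const at::Tensor& points) {
  // old: if (points.type().is_cuda())  -- deprecated DeprecatedTypeProperties
  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return my_op_gpu(points);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return my_op_cpu(points);
}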
mmdet3d/ops/voxel/src/voxelization_cpu.cpp
View file @
ee5667c1
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {

template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
                             torch::TensorAccessor<T_int, 2> coors,
                             const std::vector<float> voxel_size,
                             const std::vector<float> coors_range,
                             const std::vector<int> grid_size,
                             const int num_points, const int num_features,
                             const int NDim) {
  const int ndim_minus_1 = NDim - 1;
  bool failed = false;
  int coor[NDim];
...
...
@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
  return;
}

template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
                          torch::TensorAccessor<T, 3> voxels,
                          torch::TensorAccessor<T_int, 2> coors,
                          torch::TensorAccessor<T_int, 1> num_points_per_voxel,
                          torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
                          int& voxel_num, const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const std::vector<int> grid_size,
                          const int max_points, const int max_voxels,
                          const int num_points, const int num_features,
                          const int NDim) {
  // declare a temp coors
  at::Tensor temp_coors = at::zeros(
      {num_points, NDim},
      at::TensorOptions().dtype(at::kInt).device(at::kCPU));

  // First use dynamic voxelization to get coors,
  // then check max points/voxels constraints
  dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
                                  voxel_size, coors_range, grid_size,
                                  num_points, num_features, NDim);

  int voxelidx, num;
  auto coor = temp_coors.accessor<int, 2>();

  for (int i = 0; i < num_points; ++i) {
    // T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
    if (coor[i][0] == -1) continue;

    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];

    // record voxel
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (max_voxels != -1 && voxel_num >= max_voxels) break;
      voxel_num += 1;

      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
...
...
@@ -116,93 +98,74 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
  return;
}

}  // namespace

namespace voxelization {

int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3) {
  // current version tooks about 0.02s_0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
  // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
  // grid_size[1], grid_size[0]);
  at::Tensor coor_to_voxelidx =
      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());

  int voxel_num = 0;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "hard_voxelize_forward", [&] {
        hard_voxelize_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
            coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
            coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
            coors_range, grid_size, max_points, max_voxels, num_points,
            num_features, NDim);
      });

  return voxel_num;
}
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
                          const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const int NDim = 3) {
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "hard_voxelize_forward", [&] {
        dynamic_voxelize_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
            voxel_size, coors_range, grid_size, num_points, num_features,
            NDim);
      });

  return;
}

}  // namespace voxelization
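The dtype dispatch in these files follows the same migration: AT_DISPATCH_FLOATING_TYPES_AND_HALF and AT_DISPATCH_ALL_TYPES now receive tensor.scalar_type() instead of the deprecated tensor.type(). A small self-contained example of the updated usage (not code from this repository):

// Dispatching a templated CPU kernel over floating dtypes with scalar_type().
#include <torch/extension.h>

template <typename scalar_t>
void scale_kernel(at::TensorAccessor<scalar_t, 1> data, float factor) {
  for (int64_t i = 0; i < data.size(0); ++i) {
    data[i] = static_cast<scalar_t>(data[i] * factor);
  }
}

void scale_inplace(at::Tensor& t, float factor) {
  // old: AT_DISPATCH_FLOATING_TYPES_AND_HALF(t.type(), ...)
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      t.scalar_type(), "scale_inplace", [&] {
        scale_kernel<scalar_t>(t.accessor<scalar_t, 1>(), factor);
      });
}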
mmdet3d/ops/voxel/src/voxelization_cuda.cu
View file @
ee5667c1
...
...
@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
...
...
@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
// 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int>
            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                points.contiguous().data_ptr<scalar_t>(),
...
...
@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 map_block(512);
  AT_DISPATCH_ALL_TYPES(
      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int>
            <<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                temp_coors.contiguous().data_ptr<int>(),
...
...
@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
      points.options().dtype(at::kInt));  // must be zero from the begining
  AT_DISPATCH_ALL_TYPES(
      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
            num_points_per_voxel.contiguous().data_ptr<int>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
...
...
@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
  dim3 cp_block(512);
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_point_to_voxel<float, int>
            <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                pts_output_size, points.contiguous().data_ptr<float>(),
...
...
@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
      std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
  dim3 coors_cp_block(512);
  AT_DISPATCH_ALL_TYPES(
      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_voxel_coors<float, int>
            <<<coors_cp_grid, coors_cp_block, 0,
               at::cuda::getCurrentCUDAStream()>>>(
                coors_output_size, temp_coors.contiguous().data_ptr<int>(),
...
...