OpenDAS / mmdetection3d

Commit ee5667c1
authored Apr 27, 2020 by zhangwenwei

    Reformat cpp code to suppress warning

parent 2bb43004

Showing 15 changed files with 884 additions and 815 deletions (+884 -815)
mmdet3d/datasets/kitti_dataset.py                         +2   -2
mmdet3d/models/anchor_heads/second_head.py                +22  -2
mmdet3d/ops/iou3d/src/iou3d.cpp                           +145 -127
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp   +56  -54
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu   +102 -90
mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp       +123 -100
mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h      +34  -27
mmdet3d/ops/spconv/include/spconv/pool_ops.h              +9   -8
mmdet3d/ops/spconv/include/spconv/spconv_ops.h            +162 -131
mmdet3d/ops/spconv/include/torch_utils.h                  +34  -33
mmdet3d/ops/voxel/src/scatter_points_cpu.cpp              +101 -110
mmdet3d/ops/voxel/src/scatter_points_cuda.cu              +6   -6
mmdet3d/ops/voxel/src/voxelization.h                      +4   -4
mmdet3d/ops/voxel/src/voxelization_cpu.cpp                +78  -115
mmdet3d/ops/voxel/src/voxelization_cuda.cu                +6   -6
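Beyond clang-format-style reflowing, two mechanical API migrations recur through every C++/CUDA file below: the deprecated AT_CHECK macro is replaced by TORCH_CHECK (with x.type().is_cuda() becoming x.device().is_cuda()), and the deprecated tensor accessor .data<T>() is replaced by .data_ptr<T>(). A minimal stand-alone sketch of both patterns, not taken from the repository and assuming a libtorch extension build environment:

#include <torch/extension.h>

// Old style (deprecated in the PyTorch releases current at the time; it is
// the source of the warnings this commit suppresses):
//   #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
//   const float *p = t.data<float>();
// New style used throughout the diffs below:
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")

float *typed_pointer(at::Tensor t) {
  CHECK_CUDA(t);  // TORCH_CHECK raises a c10::Error on failure
  return t.data_ptr<float>();  // replaces the deprecated t.data<float>()
}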
mmdet3d/datasets/kitti_dataset.py

@@ -274,7 +274,7 @@ class KittiDataset(torch_data.Dataset):
                 out)
         return result_files
 
-    def evaluate(self, result_files, eval_types=None):
+    def evaluate(self, result_files, logger=None, eval_types=None):
         from mmdet3d.core.evaluation import kitti_eval
         gt_annos = [info['annos'] for info in self.kitti_infos]
         if eval_types == 'img_bbox':
@@ -283,7 +283,7 @@ class KittiDataset(torch_data.Dataset):
         else:
             ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
                                                 self.class_names)
-        return ap_result_str, ap_dict
+        return ap_dict
 
     def bbox2result_kitti(self, net_outputs, class_names, out=None):
         if out:
mmdet3d/models/anchor_heads/second_head.py

@@ -15,12 +15,32 @@ from .train_mixins import AnchorTrainMixin
 @HEADS.register_module
 class SECONDHead(nn.Module, AnchorTrainMixin):
-    """Anchor-based head (RPN, RetinaNet, SSD, etc.).
+    """Anchor-based head for VoxelNet detectors.
 
     Args:
-        class_name (list[str]): name of classes (TODO: to be removed)
         in_channels (int): Number of channels in the input feature map.
-        train_cfg (dict): train configs
-        test_cfg (dict): test configs
         feat_channels (int): Number of channels of the feature map.
+        use_direction_classifier (bool): Whether to add a direction classifier.
+        encode_bg_as_zeros (bool): Whether to use sigmoid of softmax
+            (TODO: to be removed)
+        box_code_size (int): The size of box code.
+        anchor_generator(dict): Config dict of anchor generator.
+        assigner_per_size (bool): Whether to do assignment for each separate
+            anchor size.
+        assign_per_class (bool): Whether to do assignment for each class.
+        diff_rad_by_sin (bool): Whether to change the difference into sin
+            difference for box regression loss.
+        dir_offset (float | int): The offset of BEV rotation angles
+            (TODO: may be moved into box coder)
+        dirlimit_offset (float | int): The limited range of BEV rotation angles
+            (TODO: may be moved into box coder)
+        box_coder (dict): Config dict of box coders.
         loss_cls (dict): Config of classification loss.
         loss_bbox (dict): Config of localization loss.
+        loss_dir (dict): Config of direction classifier loss.
     """  # noqa: W605
 
     def __init__(self,
@@ -253,7 +273,7 @@ class SECONDHead(nn.Module, AnchorTrainMixin):
         num_levels = len(cls_scores)
         featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
         device = cls_scores[0].device
-        mlvl_anchors = self.anchor_generators.grid_anchors(
+        mlvl_anchors = self.anchor_generator.grid_anchors(
             featmap_sizes, device=device)
         mlvl_anchors = [
             anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors
mmdet3d/ops/iou3d/src/iou3d.cpp

-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
-#include <vector>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+
+#include <vector>
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
-#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
-#define CHECK_ERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#define CHECK_ERROR(ans) \
+  { gpuAssert((ans), __FILE__, __LINE__); }
 
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
-{
-  if (code != cudaSuccess)
-  {
-    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+inline void gpuAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true) {
+  if (code != cudaSuccess) {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
     if (abort) exit(code);
   }
 }
 
 const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
 
-void boxesoverlapLauncher(const int num_a, const float *boxes_a,
-                          const int num_b, const float *boxes_b, float *ans_overlap);
-void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
-                         const float *boxes_b, float *ans_iou);
-void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
-                 float nms_overlap_thresh);
-void nmsNormalLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
-                       float nms_overlap_thresh);
+void boxesoverlapLauncher(const int num_a, const float *boxes_a,
+                          const int num_b, const float *boxes_b,
+                          float *ans_overlap);
+void boxesioubevLauncher(const int num_a, const float *boxes_a,
+                         const int num_b, const float *boxes_b,
+                         float *ans_iou);
+void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
+                 float nms_overlap_thresh);
+void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
+                       int boxes_num, float nms_overlap_thresh);
 
-int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap){
+int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
+                          at::Tensor ans_overlap) {
   // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
   // params boxes_b: (M, 5)
   // params ans_overlap: (N, M)
@@ -40,16 +51,18 @@ int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans
   int num_a = boxes_a.size(0);
   int num_b = boxes_b.size(0);
 
-  const float *boxes_a_data = boxes_a.data<float>();
-  const float *boxes_b_data = boxes_b.data<float>();
-  float *ans_overlap_data = ans_overlap.data<float>();
+  const float *boxes_a_data = boxes_a.data_ptr<float>();
+  const float *boxes_b_data = boxes_b.data_ptr<float>();
+  float *ans_overlap_data = ans_overlap.data_ptr<float>();
 
   boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_overlap_data);
 
   return 1;
 }
 
-int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou){
+int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
+                      at::Tensor ans_iou) {
   // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
   // params boxes_b: (M, 5)
   // params ans_overlap: (N, M)
@@ -61,16 +74,16 @@ int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou
   int num_a = boxes_a.size(0);
   int num_b = boxes_b.size(0);
 
-  const float *boxes_a_data = boxes_a.data<float>();
-  const float *boxes_b_data = boxes_b.data<float>();
-  float *ans_iou_data = ans_iou.data<float>();
+  const float *boxes_a_data = boxes_a.data_ptr<float>();
+  const float *boxes_b_data = boxes_b.data_ptr<float>();
+  float *ans_iou_data = ans_iou.data_ptr<float>();
 
   boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);
 
   return 1;
 }
 
-int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
+int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh) {
   // params boxes: (N, 5) [x1, y1, x2, y2, ry]
   // params keep: (N)
@@ -78,21 +91,24 @@ int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
   CHECK_CONTIGUOUS(keep);
 
   int boxes_num = boxes.size(0);
-  const float *boxes_data = boxes.data<float>();
-  long *keep_data = keep.data<long>();
+  const float *boxes_data = boxes.data_ptr<float>();
+  long *keep_data = keep.data_ptr<long>();
 
   const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
 
   unsigned long long *mask_data = NULL;
-  CHECK_ERROR(cudaMalloc((void **)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long)));
+  CHECK_ERROR(cudaMalloc((void **)&mask_data,
+                         boxes_num * col_blocks * sizeof(unsigned long long)));
   nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
 
   // unsigned long long mask_cpu[boxes_num * col_blocks];
-  // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks];
+  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
+  // col_blocks];
   std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
 
   // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
-  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long),
+  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
+                         boxes_num * col_blocks * sizeof(unsigned long long),
                          cudaMemcpyDeviceToHost));
 
   cudaFree(mask_data);
@@ -102,25 +118,25 @@ int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
   int num_to_keep = 0;
 
-  for (int i = 0; i < boxes_num; i++){
+  for (int i = 0; i < boxes_num; i++) {
     int nblock = i / THREADS_PER_BLOCK_NMS;
     int inblock = i % THREADS_PER_BLOCK_NMS;
 
-    if (!(remv_cpu[nblock] & (1ULL << inblock))){
+    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
       keep_data[num_to_keep++] = i;
       unsigned long long *p = &mask_cpu[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++){
+      for (int j = nblock; j < col_blocks; j++) {
         remv_cpu[j] |= p[j];
       }
     }
   }
-  if (cudaSuccess != cudaGetLastError())
-    printf("Error!\n");
+  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
 
   return num_to_keep;
 }
 
-int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
-                   float nms_overlap_thresh){
+int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
+                   float nms_overlap_thresh) {
   // params boxes: (N, 5) [x1, y1, x2, y2, ry]
   // params keep: (N)
@@ -128,21 +144,24 @@ int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
   CHECK_CONTIGUOUS(keep);
 
   int boxes_num = boxes.size(0);
-  const float *boxes_data = boxes.data<float>();
-  long *keep_data = keep.data<long>();
+  const float *boxes_data = boxes.data_ptr<float>();
+  long *keep_data = keep.data_ptr<long>();
 
   const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
 
   unsigned long long *mask_data = NULL;
-  CHECK_ERROR(cudaMalloc((void **)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long)));
+  CHECK_ERROR(cudaMalloc((void **)&mask_data,
+                         boxes_num * col_blocks * sizeof(unsigned long long)));
   nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
 
   // unsigned long long mask_cpu[boxes_num * col_blocks];
-  // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks];
+  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
+  // col_blocks];
   std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);
 
   // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
-  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long),
+  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
+                         boxes_num * col_blocks * sizeof(unsigned long long),
                          cudaMemcpyDeviceToHost));
 
   cudaFree(mask_data);
@@ -152,27 +171,26 @@ int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
   int num_to_keep = 0;
 
-  for (int i = 0; i < boxes_num; i++){
+  for (int i = 0; i < boxes_num; i++) {
     int nblock = i / THREADS_PER_BLOCK_NMS;
     int inblock = i % THREADS_PER_BLOCK_NMS;
 
-    if (!(remv_cpu[nblock] & (1ULL << inblock))){
+    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
       keep_data[num_to_keep++] = i;
       unsigned long long *p = &mask_cpu[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++){
+      for (int j = nblock; j < col_blocks; j++) {
         remv_cpu[j] |= p[j];
       }
     }
   }
-  if (cudaSuccess != cudaGetLastError())
-    printf("Error!\n");
+  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
 
   return num_to_keep;
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, "oriented boxes overlap");
   m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
   m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
   m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
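The host side of nms_gpu and nms_normal_gpu above decodes a suppression bitmask produced on the GPU: word mask[i * col_blocks + j] has bit b set when box i suppresses box j * 64 + b. Below is a CPU-only sketch of that decoding loop with a hand-built toy mask; nothing here comes from the CUDA kernel, and the mask values are invented for illustration:

#include <cstdio>
#include <vector>

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;  // 64

int main() {
  int boxes_num = 4;
  int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);  // 1
  // mask_cpu[i * col_blocks + j]: bit b set => box i suppresses box
  // j * 64 + b. Toy data: box 0 suppresses boxes 1 and 2.
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks, 0);
  mask_cpu[0] = (1ULL << 1) | (1ULL << 2);

  std::vector<unsigned long long> remv_cpu(col_blocks, 0);
  std::vector<long> keep(boxes_num, 0);
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;
    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep[num_to_keep++] = i;  // box i survives
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      // Merge everything box i suppresses into the running "removed" mask.
      for (int j = nblock; j < col_blocks; j++) remv_cpu[j] |= p[j];
    }
  }
  for (int k = 0; k < num_to_keep; k++) printf("keep %ld\n", keep[k]);
  return 0;  // prints: keep 0, keep 3
}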
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp

-//Modified from
-//https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
-//Points in boxes cpu
-//Written by Shaoshuai Shi
-//All Rights Reserved 2019.
+// Modified from
+// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+// Points in boxes cpu
+// Written by Shaoshuai Shi
+// All Rights Reserved 2019.
 
-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
 
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 
 // #define DEBUG
 
-inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
-                                      float &local_x, float &local_y){
+inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
+                                      float &local_x, float &local_y) {
   // should rotate pi/2 + alpha to translate LiDAR to local
   float rot_angle = rz + M_PI / 2;
   float cosa = cos(rot_angle), sina = sin(rot_angle);
@@ -24,10 +23,11 @@ inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, fl
   local_y = shift_x * sina + shift_y * cosa;
 }
 
-inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
-                                 float &local_x, float &local_y){
+inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
+                                 float &local_x, float &local_y) {
   // param pt: (x, y, z)
-  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the bottom center
+  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
+  // bottom center
   float x = pt[0], y = pt[1], z = pt[2];
   float cx = box3d[0], cy = box3d[1], cz = box3d[2];
   float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
@@ -35,15 +35,16 @@ inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, float &loc
   if (fabsf(z - cz) > h / 2.0) return 0;
   lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
   float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                   (local_y > -w / 2.0) & (local_y < w / 2.0);
   return in_flag;
 }
 
-int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
-                        at::Tensor pts_indices_tensor){
-  // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the bottom center, each box DO NOT overlaps
-  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
-  // params pts_indices: (N, npoints)
+int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                        at::Tensor pts_indices_tensor) {
+  // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the
+  // bottom center, each box DO NOT overlaps
+  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
+  // params pts_indices: (N, npoints)
 
   CHECK_CONTIGUOUS(boxes_tensor);
   CHECK_CONTIGUOUS(pts_tensor);
@@ -52,14 +53,15 @@ int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
   int boxes_num = boxes_tensor.size(0);
   int pts_num = pts_tensor.size(0);
 
-  const float *boxes = boxes_tensor.data<float>();
-  const float *pts = pts_tensor.data<float>();
-  int *pts_indices = pts_indices_tensor.data<int>();
+  const float *boxes = boxes_tensor.data_ptr<float>();
+  const float *pts = pts_tensor.data_ptr<float>();
+  int *pts_indices = pts_indices_tensor.data_ptr<int>();
 
   float local_x = 0, local_y = 0;
-  for (int i = 0; i < boxes_num; i++){
-    for (int j = 0; j < pts_num; j++){
-      int cur_in_flag = check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
+  for (int i = 0; i < boxes_num; i++) {
+    for (int j = 0; j < pts_num; j++) {
+      int cur_in_flag =
+          check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
       pts_indices[i * pts_num + j] = cur_in_flag;
     }
   }
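For reference, the geometry that check_pt_in_box3d_cpu implements can be exercised stand-alone. The sketch below mirrors the diffed code (rotation by rz + pi/2 into the box frame, a height test against h/2, and a footprint test against l/2 and w/2); the cz += h/2 shift and the local_x formula are not visible in the hunks above and are taken as assumptions from the upstream PCDet source this file credits:

#include <math.h>
#include <stdio.h>

// Rotate (shift_x, shift_y) by rz + pi/2 into the box frame, matching the
// convention of lidar_to_local_coords_cpu above (local_x formula assumed).
inline void lidar_to_local(float shift_x, float shift_y, float rz,
                           float &local_x, float &local_y) {
  float rot_angle = rz + M_PI / 2;
  float cosa = cosf(rot_angle), sina = sinf(rot_angle);
  local_x = shift_x * cosa - shift_y * sina;
  local_y = shift_x * sina + shift_y * cosa;
}

inline int check_pt_in_box3d(const float *pt, const float *box3d,
                             float &local_x, float &local_y) {
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
  cz += h / 2.0f;  // bottom centre -> geometric centre (assumed, per PCDet)
  if (fabsf(z - cz) > h / 2.0f) return 0;
  lidar_to_local(x - cx, y - cy, rz, local_x, local_y);
  // Bitwise & over comparisons, exactly as in the diffed source.
  return (local_x > -l / 2.0f) & (local_x < l / 2.0f) &
         (local_y > -w / 2.0f) & (local_y < w / 2.0f);
}

int main() {
  float box[7] = {0.f, 0.f, 0.f, 2.f, 4.f, 2.f, 0.f};  // w=2, l=4, h=2, rz=0
  float inside[3] = {0.f, 0.f, 1.f}, outside[3] = {5.f, 0.f, 1.f};
  float lx, ly;
  printf("%d %d\n", check_pt_in_box3d(inside, box, lx, ly),
         check_pt_in_box3d(outside, box, lx, ly));  // prints: 1 0
  return 0;
}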
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu

-//Modified from
-//https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
-//Points in boxes gpu
-//Written by Shaoshuai Shi
-//All Rights Reserved 2019.
+// Modified from
+// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+// Points in boxes gpu
+// Written by Shaoshuai Shi
+// All Rights Reserved 2019.
 
-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
 
 #define THREADS_PER_BLOCK 256
-#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
-#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x); \
+  CHECK_CONTIGUOUS(x)
 
 // #define DEBUG
 
-__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
-                                             float rz, float &local_x, float &local_y){
+__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
+                                             float rz, float &local_x,
+                                             float &local_y) {
   // should rotate pi/2 + alpha to translate LiDAR to local
   float rot_angle = rz + M_PI / 2;
   float cosa = cos(rot_angle), sina = sin(rot_angle);
@@ -29,10 +32,11 @@ __device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float
   local_y = shift_x * sina + shift_y * cosa;
 }
 
-__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
-                                        float &local_x, float &local_y){
+__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
+                                        float &local_x, float &local_y) {
   // param pt: (x, y, z)
-  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the bottom center
+  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
+  // bottom center
   float x = pt[0], y = pt[1], z = pt[2];
   float cx = box3d[0], cy = box3d[1], cz = box3d[2];
   float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
@@ -40,16 +44,19 @@ __device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, flo
   if (fabsf(z - cz) > h / 2.0) return 0;
   lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
   float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                   (local_y > -w / 2.0) & (local_y < w / 2.0);
   return in_flag;
 }
 
-__global__ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_num,
-                                       const float *boxes, const float *pts,
-                                       int *box_idx_of_points){
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the bottom center, each box DO NOT overlaps
-  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
-  // params boxes_idx_of_points: (B, npoints), default -1
+__global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
+                                       int pts_num, const float *boxes,
+                                       const float *pts,
+                                       int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
+  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
+  // -1
   int bs_idx = blockIdx.y;
   int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -61,26 +68,28 @@ __global__ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_nu
   float local_x = 0, local_y = 0;
   int cur_in_flag = 0;
-  for (int k = 0; k < boxes_num; k++){
+  for (int k = 0; k < boxes_num; k++) {
     cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
-    if (cur_in_flag){
+    if (cur_in_flag) {
       box_idx_of_points[0] = k;
       break;
     }
   }
 }
 
-void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
-                              const float *boxes, const float *pts,
-                              int *box_idx_of_points){
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the bottom center, each box DO NOT overlaps
-  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
-  // params boxes_idx_of_points: (B, npoints), default -1
+void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
+                              const float *boxes, const float *pts,
+                              int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
+  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
+  // -1
   cudaError_t err;
 
   dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
   dim3 threads(THREADS_PER_BLOCK);
-  points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
+  points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
+                                              boxes, pts, box_idx_of_points);
 
   err = cudaGetLastError();
   if (cudaSuccess != err) {
@@ -93,10 +102,12 @@ void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, const
 #endif
 }
 
-int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
-                        at::Tensor box_idx_of_points_tensor){
-  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the bottom center, each box DO NOT overlaps
-  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
-  // params boxes_idx_of_points: (B, npoints), default -1
+int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
+                        at::Tensor box_idx_of_points_tensor) {
+  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
+  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,
+  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default
+  // -1
   CHECK_INPUT(boxes_tensor);
   CHECK_INPUT(pts_tensor);
@@ -106,11 +117,12 @@ int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, at::Tens
   int boxes_num = boxes_tensor.size(1);
   int pts_num = pts_tensor.size(1);
 
-  const float *boxes = boxes_tensor.data<float>();
-  const float *pts = pts_tensor.data<float>();
-  int *box_idx_of_points = box_idx_of_points_tensor.data<int>();
+  const float *boxes = boxes_tensor.data_ptr<float>();
+  const float *pts = pts_tensor.data_ptr<float>();
+  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
 
   points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts,
                            box_idx_of_points);
 
   return 1;
 }
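The launcher above maps one CUDA thread to each point (grid x) and one block row to each batch element (grid y), with DIVUP rounding the point count up to whole blocks. A plain C++ sketch of that launch arithmetic, checkable without a GPU (the sizes are invented for illustration):

#include <cstdio>

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define THREADS_PER_BLOCK 256

int main() {
  int batch_size = 4, pts_num = 1000;
  int blocks_x = DIVUP(pts_num, THREADS_PER_BLOCK);  // ceil(1000/256) = 4
  printf("grid = (%d, %d), block = %d\n", blocks_x, batch_size,
         THREADS_PER_BLOCK);  // grid = (4, 4), block = 256
  // Inside the kernel: pt_idx = blockIdx.x * blockDim.x + threadIdx.x and
  // bs_idx = blockIdx.y; threads with pt_idx >= pts_num return early.
  return 0;
}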
mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp

-//Modified from
-//https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
-//RoI-aware point cloud feature pooling
-//Written by Shaoshuai Shi
-//All Rights Reserved 2019.
+// Modified from
+// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+// RoI-aware point cloud feature pooling
+// Written by Shaoshuai Shi
+// All Rights Reserved 2019.
 
-#include <torch/serialize/tensor.h>
-#include <torch/extension.h>
 #include <assert.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
-#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
-#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x); \
+  CHECK_CONTIGUOUS(x)
 
-void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, int max_pts_each_voxel,
-                              int out_x, int out_y, int out_z, const float *rois, const float *pts, const float *pts_feature,
-                              int *argmax, int *pts_idx_of_voxels, float *pooled_features, int pool_method);
+void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
+                              int max_pts_each_voxel, int out_x, int out_y,
+                              int out_z, const float *rois, const float *pts,
+                              const float *pts_feature, int *argmax,
+                              int *pts_idx_of_voxels, float *pooled_features,
+                              int pool_method);
 
-void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, int out_z, int channels, int max_pts_each_voxel,
-                                       const int *pts_idx_of_voxels, const int *argmax, const float *grad_out, float *grad_in, int pool_method);
+void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
+                                       int out_z, int channels,
+                                       int max_pts_each_voxel,
+                                       const int *pts_idx_of_voxels,
+                                       const int *argmax,
+                                       const float *grad_out, float *grad_in,
+                                       int pool_method);
 
-int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, at::Tensor argmax,
-                        at::Tensor pts_idx_of_voxels, at::Tensor pooled_features, int pool_method);
+int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts,
+                        at::Tensor pts_feature, at::Tensor argmax,
+                        at::Tensor pts_idx_of_voxels,
+                        at::Tensor pooled_features, int pool_method);
 
-int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax, at::Tensor grad_out,
-                                 at::Tensor grad_in, int pool_method);
+int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
+                                 at::Tensor argmax, at::Tensor grad_out,
+                                 at::Tensor grad_in, int pool_method);
 
 int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                         at::Tensor pts_indices_tensor);
 
 int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                         at::Tensor box_idx_of_points_tensor);
 
-int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, at::Tensor argmax,
-                        at::Tensor pts_idx_of_voxels, at::Tensor pooled_features, int pool_method){
+int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts,
+                        at::Tensor pts_feature, at::Tensor argmax,
+                        at::Tensor pts_idx_of_voxels,
+                        at::Tensor pooled_features, int pool_method) {
   // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate
   // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
   // params pts_feature: (npoints, C)
@@ -56,22 +69,27 @@ int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
   int out_x = pts_idx_of_voxels.size(1);
   int out_y = pts_idx_of_voxels.size(2);
   int out_z = pts_idx_of_voxels.size(3);
   assert((out_x < 256) && (out_y < 256) && (out_z < 256));  // we encode index with 8bit
 
-  const float *rois_data = rois.data<float>();
-  const float *pts_data = pts.data<float>();
-  const float *pts_feature_data = pts_feature.data<float>();
-  int *argmax_data = argmax.data<int>();
-  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data<int>();
-  float *pooled_features_data = pooled_features.data<float>();
+  const float *rois_data = rois.data_ptr<float>();
+  const float *pts_data = pts.data_ptr<float>();
+  const float *pts_feature_data = pts_feature.data_ptr<float>();
+  int *argmax_data = argmax.data_ptr<int>();
+  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
+  float *pooled_features_data = pooled_features.data_ptr<float>();
 
-  roiaware_pool3d_launcher(boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
-                           rois_data, pts_data, pts_feature_data, argmax_data, pts_idx_of_voxels_data, pooled_features_data, pool_method);
+  roiaware_pool3d_launcher(
+      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
+      rois_data, pts_data, pts_feature_data, argmax_data,
+      pts_idx_of_voxels_data, pooled_features_data, pool_method);
 
   return 1;
 }
 
-int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax, at::Tensor grad_out,
-                                 at::Tensor grad_in, int pool_method){
+int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
+                                 at::Tensor argmax, at::Tensor grad_out,
+                                 at::Tensor grad_in, int pool_method) {
   // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
   // params argmax: (N, out_x, out_y, out_z, C)
   // params grad_out: (N, out_x, out_y, out_z, C)
@@ -90,20 +108,25 @@ int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, at::Tensor argmax
   int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
   int channels = grad_out.size(4);
 
-  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data<int>();
-  const int *argmax_data = argmax.data<int>();
-  const float *grad_out_data = grad_out.data<float>();
-  float *grad_in_data = grad_in.data<float>();
+  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
+  const int *argmax_data = argmax.data_ptr<int>();
+  const float *grad_out_data = grad_out.data_ptr<float>();
+  float *grad_in_data = grad_in.data_ptr<float>();
 
-  roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
-                                    pts_idx_of_voxels_data, argmax_data, grad_out_data, grad_in_data, pool_method);
+  roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
+                                    max_pts_each_voxel, pts_idx_of_voxels_data,
+                                    argmax_data, grad_out_data, grad_in_data,
+                                    pool_method);
 
   return 1;
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
-  m.def("backward", &roiaware_pool3d_gpu_backward, "roiaware pool3d backward (CUDA)");
+  m.def("backward", &roiaware_pool3d_gpu_backward,
+        "roiaware pool3d backward (CUDA)");
   m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
         "points_in_boxes_gpu forward (CUDA)");
   m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
         "points_in_boxes_cpu forward (CPU)");
 }
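The assert above, annotated "we encode index with 8bit", caps each pooled-grid dimension at 256 so a voxel coordinate triple fits in 24 bits of an int. The pack/unpack helpers below are hypothetical, written only to make that constraint concrete; the real encoding lives in the CUDA kernels, not in this .cpp file, and its exact bit layout is not shown in the diff:

#include <cassert>
#include <cstdio>

int pack_voxel(int x, int y, int z) {
  assert(x < 256 && y < 256 && z < 256);  // mirrors the assert in the source
  return (x << 16) | (y << 8) | z;        // 8 bits per axis (assumed layout)
}

void unpack_voxel(int code, int &x, int &y, int &z) {
  x = (code >> 16) & 0xff;
  y = (code >> 8) & 0xff;
  z = code & 0xff;
}

int main() {
  int x, y, z;
  unpack_voxel(pack_voxel(12, 34, 56), x, y, z);
  printf("%d %d %d\n", x, y, z);  // prints: 12 34 56
  return 0;
}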
mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h

@@ -26,9 +26,10 @@ namespace spconv {
 // torch.jit's doc says only support int64, so we need to convert to int32.
 template <typename T>
-torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
-                                       torch::Tensor indicePairs, torch::Tensor indiceNum,
-                                       int64_t numActOut, int64_t _inverse, int64_t _subM) {
+torch::Tensor fusedIndiceConvBatchNorm(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
+    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
+    int64_t _inverse, int64_t _subM) {
   bool subM = _subM != 0;
   bool inverse = _inverse != 0;
   auto device = features.device().type();
@@ -37,13 +38,16 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
   auto numInPlanes = features.size(1);
   auto numOutPlanes = filters.size(ndim + 1);
   auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
   int indicePairMaxSize = *indicePairMaxSizeIter;
 
   /*if (_subM){
-  std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
+  std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
+  indicePairNumCpu.data_ptr<int>() + kernelVolume);
   indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
   auto indicePairVecMaxSizeIter = std::max_element(
@@ -56,8 +60,10 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
   // auto indicePairOptions =
   // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
 
-  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
-  torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options);
-  torch::Tensor outputBuffer = torch::zeros({indicePairMaxSize, numOutPlanes}, options);
+  torch::Tensor output =
+      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
+  torch::Tensor inputBuffer =
+      torch::zeros({indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
   filters = filters.view({-1, numInPlanes, numOutPlanes});
@@ -69,33 +75,34 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
   double totalGEMMTime = 0;
   double totalSAddTime = 0;
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
       continue;
     }
     // auto timer = spconv::CudaContextTimer<>();
     auto outputBufferBlob =
-        torch::from_blob(outputBuffer.data<T>(),
+        torch::from_blob(outputBuffer.data_ptr<T>(),
                          {nHot, numOutPlanes}, options);
     auto inputBufferBlob =
-        torch::from_blob(inputBuffer.data<T>(),
+        torch::from_blob(inputBuffer.data_ptr<T>(),
                          {nHot, numInPlanes}, options);
 
     if (device == torch::kCPU) {
       functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
       gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
     } else {
       functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
       gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
       TV_CHECK_CUDA_ERR();
       /* slower than SparseGatherFunctor, may due to int->long conversion
       auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot},
-      indicePairOptions);
-      torch::index_select_out(inputBufferBlob, features, 0,
-      indicePairBlob);*/
+      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
+      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
+      features, 0, indicePairBlob);*/
     }
     // totalGatherTime += timer.report() / 1000.0;
     torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
@@ -105,14 +112,14 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
       functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
       scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                   tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                  true);
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                  nHot, true);
     } else {
       functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
       scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                   tv::torch2tv<const T>(outputBuffer),
-                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
-                  true);
+                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                  nHot, true);
       TV_CHECK_CUDA_ERR();
     }
     // totalSAddTime += timer.report() / 1000.0;
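The indicePairMaxSize bookkeeping above runs std::max_element directly over the raw int buffer exposed by data_ptr<int>() on a CPU tensor. A stand-alone sketch of the same idiom, using a plain array in place of the tensor (values invented for illustration):

#include <algorithm>
#include <cstdio>

int main() {
  int kernelVolume = 5;
  int indicePairNum[5] = {3, 9, 2, 9, 1};  // toy per-kernel-offset counts
  const int *begin = indicePairNum;        // tensor.data_ptr<int>() in the source
  const int *it = std::max_element(begin, begin + kernelVolume);
  int indicePairMaxOffset = it - begin;  // 1 (first maximum wins)
  int indicePairMaxSize = *it;           // 9
  printf("offset=%d size=%d\n", indicePairMaxOffset, indicePairMaxSize);
  return 0;
}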
mmdet3d/ops/spconv/include/spconv/pool_ops.h

@@ -34,7 +34,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
   torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
   double totalTime = 0;
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0) {
       continue;
     }
@@ -60,7 +60,8 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
 template <typename T>
-torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
-                                    torch::Tensor outFeatures,
-                                    torch::Tensor outGrad, torch::Tensor indicePairs,
+torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
+                                    torch::Tensor outFeatures,
+                                    torch::Tensor outGrad,
+                                    torch::Tensor indicePairs,
                                     torch::Tensor indiceNum) {
   auto device = features.device().type();
   auto numInPlanes = features.size(1);
@@ -70,7 +71,7 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
   torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
   auto kernelVolume = indicePairs.size(0);
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0) {
       continue;
     }
mmdet3d/ops/spconv/include/spconv/spconv_ops.h

@@ -25,8 +25,8 @@
 namespace spconv {
 // torch.jit's doc says only support int64, so we need to convert to int32.
 template <unsigned NDim>
 std::vector<torch::Tensor>
 getIndicePair(
     torch::Tensor indices, int64_t batchSize,
     std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
     std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
     std::vector<int64_t> padding, std::vector<int64_t> dilation,
@@ -67,8 +67,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
   tv::SimpleVector<int, NDim> stride32;
   tv::SimpleVector<int, NDim> padding32;
   tv::SimpleVector<int, NDim> dilation32;
   auto indicePairUnique =
       torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
                   torch::dtype(torch::kInt32).device(indices.device()));
   for (int i = 0; i < NDim; ++i) {
     outSpatialShape32.push_back(outSpatialShape[i]);
@@ -88,16 +88,18 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
     auto getIndicePairFtor =
         functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
     numActOut = getIndicePairFtor(
-        tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
-        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-        stride32, padding32, dilation32, outSpatialShape32, transpose);
+        tv::CPU(), tv::torch2tv<const int>(indices),
+        tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
+        tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
+        dilation32, outSpatialShape32, transpose);
   } else {
     auto getIndicePairFtor =
         functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
     numActOut = getIndicePairFtor(
-        tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
-        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-        stride32, padding32, dilation32, outSpatialShape32, transpose);
+        tv::TorchGPU(), tv::torch2tv<const int>(indices),
+        tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
+        tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
+        dilation32, outSpatialShape32, transpose);
   }
   return {indices, indicePairs, indiceNum};
 } else {
@@ -105,19 +107,21 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
       torch::zeros({numAct * kernelVolume, coorDim + 1},
                    torch::dtype(torch::kInt32).device(indices.device()));
   if (indices.device().type() == torch::kCPU) {
-    auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
+    auto getIndicePairFtor =
+        functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
     numActOut = getIndicePairFtor(
         tv::CPU(), tv::torch2tv<const int>(indices),
         tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
-        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-        stride32, padding32, dilation32, outSpatialShape32, transpose);
+        tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
+        kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
+        transpose);
   } else {
     auto getIndicePairFtorP1 =
         functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
     auto getIndicePairFtorP2 =
         functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
     numActOut =
         getIndicePairFtorP1(
             tv::TorchGPU(), tv::torch2tv<const int>(indices),
             tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
             tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
             tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
@@ -137,8 +141,8 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
 }
 
 template <unsigned NDim>
 std::vector<torch::Tensor>
 getIndicePairPreGrid(
     torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
     std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
     std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
     std::vector<int64_t> padding, std::vector<int64_t> dilation,
@@ -177,8 +181,8 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
   tv::SimpleVector<int, NDim> stride32;
   tv::SimpleVector<int, NDim> padding32;
   tv::SimpleVector<int, NDim> dilation32;
   auto indicePairUnique =
       torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
                   torch::dtype(torch::kInt32).device(indices.device()));
   for (int i = 0; i < NDim; ++i) {
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
outSpatialShape32
.
push_back
(
outSpatialShape
[
i
]);
outSpatialShape32
.
push_back
(
outSpatialShape
[
i
]);
...
@@ -198,17 +202,19 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
...
@@ -198,17 +202,19 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
auto
getIndicePairFtor
=
auto
getIndicePairFtor
=
functor
::
CreateSubMIndicePairFunctor
<
tv
::
CPU
,
int
,
int
,
NDim
>
();
functor
::
CreateSubMIndicePairFunctor
<
tv
::
CPU
,
int
,
int
,
NDim
>
();
numActOut
=
getIndicePairFtor
(
numActOut
=
getIndicePairFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
CPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
);
gridOut
.
fill_
(
-
1
);
gridOut
.
fill_
(
-
1
);
}
else
{
}
else
{
auto
getIndicePairFtor
=
auto
getIndicePairFtor
=
functor
::
CreateSubMIndicePairFunctor
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
functor
::
CreateSubMIndicePairFunctor
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
numActOut
=
getIndicePairFtor
(
numActOut
=
getIndicePairFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
,
true
);
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
,
true
);
}
}
return
{
indices
,
indicePairs
,
indiceNum
};
return
{
indices
,
indicePairs
,
indiceNum
};
}
else
{
}
else
{
...
@@ -216,20 +222,22 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
...
@@ -216,20 +222,22 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
torch
::
zeros
({
numAct
*
kernelVolume
,
coorDim
+
1
},
torch
::
zeros
({
numAct
*
kernelVolume
,
coorDim
+
1
},
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
auto
getIndicePairFtor
=
functor
::
CreateConvIndicePairFunctor
<
tv
::
CPU
,
int
,
int
,
NDim
>
();
auto
getIndicePairFtor
=
functor
::
CreateConvIndicePairFunctor
<
tv
::
CPU
,
int
,
int
,
NDim
>
();
numActOut
=
getIndicePairFtor
(
numActOut
=
getIndicePairFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
CPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
kernelSize32
,
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
,
true
);
kernelSize32
,
stride32
,
padding32
,
dilation32
,
outSpatialShape32
,
transpose
,
true
);
gridOut
.
fill_
(
-
1
);
gridOut
.
fill_
(
-
1
);
}
else
{
}
else
{
auto
getIndicePairFtorP1
=
auto
getIndicePairFtorP1
=
functor
::
CreateConvIndicePairFunctorP1
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
functor
::
CreateConvIndicePairFunctorP1
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
auto
getIndicePairFtorP2
=
auto
getIndicePairFtorP2
=
functor
::
CreateConvIndicePairFunctorP2
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
functor
::
CreateConvIndicePairFunctorP2
<
tv
::
GPU
,
int
,
int
,
NDim
>
();
numActOut
=
numActOut
=
getIndicePairFtorP1
(
getIndicePairFtorP1
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
tv
::
torch2tv
<
int
>
(
indicePairUnique
),
kernelSize32
,
stride32
,
tv
::
torch2tv
<
int
>
(
indicePairUnique
),
kernelSize32
,
stride32
,
...
@@ -241,15 +249,14 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
...
@@ -241,15 +249,14 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
TorchGPU
(),
tv
::
torch2tv
<
const
int
>
(
indices
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
outInds
),
tv
::
torch2tv
<
int
>
(
gridOut
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
tv
::
torch2tv
<
int
>
(
indicePairs
),
tv
::
torch2tv
<
int
>
(
indiceNum
),
tv
::
torch2tv
<
int
>
(
indicePairUnique
),
outSpatialShape32
,
transpose
,
true
);
tv
::
torch2tv
<
int
>
(
indicePairUnique
),
outSpatialShape32
,
transpose
,
true
);
}
}
}
}
return
{
outInds
.
slice
(
0
,
0
,
numActOut
),
indicePairs
,
indiceNum
};
return
{
outInds
.
slice
(
0
,
0
,
numActOut
),
indicePairs
,
indiceNum
};
}
}
}
}
template
<
typename
T
>
template
<
typename
T
>
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
...
@@ -262,13 +269,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
   auto numInPlanes = features.size(1);
   auto numOutPlanes = filters.size(ndim + 1);
   auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
   int indicePairMaxSize = *indicePairMaxSizeIter;
   /*if (_subM){
-  std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
+  std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
+  indicePairNumCpu.data_ptr<int>() + kernelVolume);
   indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
   auto indicePairVecMaxSizeIter = std::max_element(
...
@@ -282,7 +292,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
   // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
   torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
   torch::Tensor inputBuffer =
       torch::zeros({indicePairMaxSize, numInPlanes}, options);
   torch::Tensor outputBuffer =
       torch::zeros({indicePairMaxSize, numOutPlanes}, options);
   filters = filters.view({-1, numInPlanes, numOutPlanes});
...
@@ -294,33 +305,34 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
   double totalGEMMTime = 0;
   double totalSAddTime = 0;
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
       continue;
     }
     // auto timer = spconv::CudaContextTimer<>();
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer.data<T>(),
-                                            {nHot, numInPlanes}, options);
+    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
+                                             {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
+                                            {nHot, numInPlanes}, options);
     if (device == torch::kCPU) {
       functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
       gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
     } else {
       functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
       gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
       TV_CHECK_CUDA_ERR();
       /* slower than SparseGatherFunctor, may due to int->long conversion
-      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
-      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot},
-                                             indicePairOptions);
-      torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob);*/
+      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
+      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
+                                             {nHot}, indicePairOptions);
+      torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob);*/
     }
     // totalGatherTime += timer.report() / 1000.0;
     torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...
@@ -330,14 +342,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
       functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
       scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                   tv::torch2tv<const T>(outputBuffer),
                   tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                   nHot, true);
     } else {
       functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
       scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                   tv::torch2tv<const T>(outputBuffer),
                   tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                   nHot, true);
       TV_CHECK_CUDA_ERR();
     }
     // totalSAddTime += timer.report() / 1000.0;
...
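The loop above is the gather-GEMM-scatter pattern at the heart of sparse convolution: for each kernel offset, the nHot active input features are gathered into a dense buffer, multiplied by that offset's filter, and scatter-added into the output. A minimal sketch of the same idea in plain libtorch (the functor:: and tv:: types are spconv internals; index_select / index_add_ stand in for them here, and every name in this sketch is illustrative, not from the repo):

#include <torch/torch.h>

// One kernel-offset step of sparse convolution, written with stock ops.
torch::Tensor gather_gemm_scatter(const torch::Tensor& features,  // [N_in, C_in]
                                  const torch::Tensor& filter,    // [C_in, C_out]
                                  const torch::Tensor& inIdx,     // [nHot] int64
                                  const torch::Tensor& outIdx,    // [nHot] int64
                                  int64_t numActOut) {
  auto output = torch::zeros({numActOut, filter.size(1)}, features.options());
  auto inputBuffer = features.index_select(0, inIdx);  // gather active inputs
  auto outputBuffer = torch::mm(inputBuffer, filter);  // dense GEMM
  output.index_add_(0, outIdx, outputBuffer);          // scatter-add to outputs
  return output;
}

spconv's hand-rolled SparseGatherFunctor/SparseScatterAddFunctor exist because, as the commented-out block above notes, going through int64 index tensors is slower than gathering with raw int32 index pairs.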
@@ -349,9 +361,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
 }

 template <typename T>
 std::vector<torch::Tensor>
 indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                    torch::Tensor outGrad, torch::Tensor indicePairs,
                    torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
   bool subM = _subM != 0;
   bool inverse = _inverse != 0;
...
@@ -362,16 +376,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
   auto numInPlanes = features.size(1);
   auto numOutPlanes = filters.size(ndim + 1);
   auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
   int indicePairMaxSize = *indicePairMaxSizeIter;
   auto options =
       torch::TensorOptions().dtype(features.dtype()).device(features.device());
   auto filterShape = filters.sizes();
   torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
   torch::Tensor filtersGrad = torch::zeros(filterShape, options);
   torch::Tensor inputBuffer =
       torch::zeros({indicePairMaxSize, numInPlanes}, options);
   torch::Tensor outputBuffer =
       torch::zeros({indicePairMaxSize, numOutPlanes}, options);
...
@@ -383,7 +400,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
     torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
   }
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
       continue;
     }
...
@@ -392,27 +409,31 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
       functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
       gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
       gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
                     tv::torch2tv<const T>(outGrad),
                     tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                     nHot);
     } else {
       functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
       functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
       gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
       TV_CHECK_CUDA_ERR();
       gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
                     tv::torch2tv<const T>(outGrad),
                     tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                     nHot);
       TV_CHECK_CUDA_ERR();
     }
     auto filterGradSub = filtersGrad[i];
-    auto outputBufferBlob = torch::from_blob(outputBuffer.data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer.data<T>(),
-                                            {nHot, numInPlanes}, options);
+    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
+                                             {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
+                                            {nHot, numInPlanes}, options);
     torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
     torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
...
@@ -420,12 +441,14 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
       functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
       scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
                   tv::torch2tv<const T>(inputBuffer),
                   tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
     } else {
       functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
       scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
                   tv::torch2tv<const T>(inputBuffer),
                   tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
       TV_CHECK_CUDA_ERR();
     }
   }
...
@@ -433,9 +456,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
 }

 template <typename T>
 torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
                                        torch::Tensor filters,
                                        torch::Tensor indicePairs,
                                        torch::Tensor indiceNum,
                                        int64_t numActOut, int64_t _inverse,
                                        int64_t _subM) {
   bool subM = _subM != 0;
   bool inverse = _inverse != 0;
...
@@ -446,15 +472,19 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
   auto numOutPlanes = filters.size(ndim + 1);
   auto indicePairNumCpu = indiceNum.to({torch::kCPU});
   auto totalActsTen = indicePairNumCpu.sum();
-  auto totalActs = indicePairNumCpu.data<int>()[0];
-  auto indicePairMaxSizeIter = std::max_element(
-      indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
-  int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>();
+  auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
   int indicePairMaxSize = *indicePairMaxSizeIter;
-  std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
-                                    indicePairNumCpu.data<int>() + kernelVolume);
+  std::vector<int> indicePairNumVec(
+      indicePairNumCpu.data_ptr<int>(),
+      indicePairNumCpu.data_ptr<int>() + kernelVolume);
   indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
   int subRuleMaxSize =
       *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
   if (subM) {
     indicePairMaxSize = subRuleMaxSize;
   }
...
@@ -470,7 +500,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
   torch::Tensor outputBuffer =
       torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
   filters = filters.view({-1, numInPlanes, numOutPlanes});
   std::cout << "create time " << timer.report() / 1000.0 << std::endl;
   if (subM) {  // the center index of subm conv don't need gather and scatter
                // add.
     torch::mm_out(output, features, filters[indicePairMaxOffset]);
...
@@ -480,43 +510,44 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
   double totalSAddTime = 0;
   // auto timer = spconv::CudaContextTimer<>();
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
       continue;
     }
     //
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
-                                            {nHot, numInPlanes}, options);
+    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
+                                             {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
+                                            {nHot, numInPlanes}, options);
     if (device == torch::kCPU) {
       functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
       gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
     } else {
       functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
       gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
       TV_CHECK_CUDA_ERR();
     }
     // }
     // for (int i = 0; i < kernelVolume; ++i) {
     // totalGatherTime += timer.report() / 1000.0;
-    // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
+    // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
     // {nHot, numOutPlanes}, options);
   }
   // totalGatherTime += timer.report() / 1000.0;
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
       continue;
     }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
-                                            {nHot, numInPlanes}, options);
+    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
+                                             {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
+                                            {nHot, numInPlanes}, options);
     torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
...
@@ -524,27 +555,27 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
   // totalGEMMTime += timer.report() / 1000.0;
   for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data<int>()[i];
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
     if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
       continue;
     }
-    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
-                                             {nHot, numOutPlanes}, options);
-    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
-                                            {nHot, numInPlanes}, options);
+    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
+                                             {nHot, numOutPlanes}, options);
+    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
+                                            {nHot, numInPlanes}, options);
     if (device == torch::kCPU) {
       functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
       scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                   tv::torch2tv<const T>(outputBufferBlob),
                   tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                   nHot, true);
     } else {
       functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
       scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                   tv::torch2tv<const T>(outputBufferBlob),
                   tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                   nHot, true);
       TV_CHECK_CUDA_ERR();
     }
     // totalSAddTime += timer.report() / 1000.0;
...
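Most of the spconv_ops.h churn above is one mechanical substitution: at::Tensor::data&lt;T&gt;() was deprecated in favour of data_ptr&lt;T&gt;(), which returns the same typed raw pointer without the compiler warning this commit suppresses. A minimal, self-contained sketch of the pattern the file uses to find the busiest kernel offset (the tensor values here are made up for illustration):

#include <torch/torch.h>
#include <algorithm>
#include <cstdio>

int main() {
  // Per-kernel-offset active-pair counts, analogous to indiceNum above.
  torch::Tensor indicePairNumCpu = torch::tensor({3, 7, 5}, torch::kInt32);
  const int kernelVolume = indicePairNumCpu.numel();
  // Old spelling: indicePairNumCpu.data<int>()  -- deprecated, warns on build.
  const int* begin = indicePairNumCpu.data_ptr<int>();
  auto maxIt = std::max_element(begin, begin + kernelVolume);
  std::printf("offset=%ld size=%d\n",
              static_cast<long>(maxIt - begin), *maxIt);  // offset=1 size=7
  return 0;
}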
mmdet3d/ops/spconv/include/torch_utils.h
View file @ ee5667c1
...
@@ -13,20 +13,21 @@
 // limitations under the License.
 #pragma once
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <tensorview/tensorview.h>
+#include <torch/script.h>

 namespace tv {
 struct TorchGPU : public tv::GPU {
   virtual cudaStream_t getStream() const override {
     return at::cuda::getCurrentCUDAStream();
   }
 };

 template <typename T>
 void check_torch_dtype(const torch::Tensor &tensor) {
   switch (tensor.type().scalarType()) {
   case at::ScalarType::Double: {
     auto val = std::is_same<std::remove_const_t<T>, double>::value;
...
@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
   for (auto i : tensor.sizes()) {
     shape.push_back(i);
   }
-  return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape);
+  return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
 }
 } // namespace tv
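torch2tv is the bridge between PyTorch tensors and spconv's tensorview library: it wraps the tensor's raw storage pointer and shape in a non-owning view, and the line changed above is exactly where the deprecated data&lt;T&gt;() used to sit. A rough sketch of the same idea with a hypothetical View struct (tv::TensorView's real definition lives in tensorview.h and is not shown in this diff):

#include <torch/torch.h>
#include <cstdio>
#include <vector>

template <typename T>
struct View {  // non-owning pointer plus shape, in the spirit of tv::TensorView
  T* data;
  std::vector<int64_t> shape;
};

template <typename T>
View<T> make_view(const torch::Tensor& t) {
  std::vector<int64_t> shape(t.sizes().begin(), t.sizes().end());
  // data_ptr<T>() replaces the deprecated data<T>() used before this commit.
  return {t.data_ptr<T>(), std::move(shape)};
}

int main() {
  auto t = torch::zeros({4, 3});
  auto v = make_view<float>(t);
  v.data[0] = 1.f;  // writes through to the tensor's storage
  std::printf("t[0][0]=%f\n", t[0][0].item<float>());
}

Note that tensor.type().scalarType() in check_torch_dtype is left untouched by this commit, so that call site still uses the older API.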
mmdet3d/ops/voxel/src/scatter_points_cpu.cpp
View file @ ee5667c1
-#include <torch/extension.h>
 #include <ATen/TensorUtils.h>
+#include <torch/extension.h>
 // #include "voxelization.h"

 namespace {

 template <typename T_int>
 void determin_max_points_kernel(
     torch::TensorAccessor<T_int, 2> coor,
     torch::TensorAccessor<T_int, 1> point_to_voxelidx,
     torch::TensorAccessor<T_int, 1> num_points_per_voxel,
     torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
     int& max_points, const int num_points) {
   int voxelidx, num;
   for (int i = 0; i < num_points; ++i) {
     if (coor[i][0] == -1) continue;
     voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
     // record voxel
...
@@ -35,25 +29,21 @@ void determin_max_points_kernel(torch::TensorAccessor<T_int,2> coor,
     num_points_per_voxel[voxelidx] += 1;
     // update max points per voxel
     max_points = std::max(max_points, num + 1);
   }
   return;
 }

 template <typename T, typename T_int>
 void scatter_point_to_voxel_kernel(
     const torch::TensorAccessor<T, 2> points,
     torch::TensorAccessor<T_int, 2> coor,
     torch::TensorAccessor<T_int, 1> point_to_voxelidx,
     torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
     torch::TensorAccessor<T, 3> voxels,
     torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
     const int num_points, const int NDim) {
   for (int i = 0; i < num_points; ++i) {
     int num = point_to_voxelidx[i];
     int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
...
@@ -68,14 +58,11 @@ void scatter_point_to_voxel_kernel(
 }  // namespace

 namespace voxelization {

 std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
     const at::Tensor& points, const at::Tensor& voxel_mapping,
     const std::vector<float> voxel_size, const std::vector<float> coors_range) {
   // current version tooks about 0.02s_0.03s for one frame on cpu
   // check device
   AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...
@@ -86,46 +73,50 @@ std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
   std::vector<int> grid_size(NDim);
   for (int i = 0; i < NDim; ++i) {
     grid_size[i] =
         round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
   }

   at::Tensor num_points_per_voxel = at::zeros(
       {num_points, }, voxel_mapping.options());
   at::Tensor coor_to_voxelidx = -at::ones(
       {grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
   at::Tensor point_to_voxelidx = -at::ones(
       {num_points, }, voxel_mapping.options());
   int voxel_num = 0;
   int max_points = 0;
-  AT_DISPATCH_ALL_TYPES(voxel_mapping.type(), "determin_max_point", [&] {
+  AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
     determin_max_points_kernel<scalar_t>(
         voxel_mapping.accessor<scalar_t, 2>(),
         point_to_voxelidx.accessor<scalar_t, 1>(),
         num_points_per_voxel.accessor<scalar_t, 1>(),
         coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
         num_points);
   });

   at::Tensor voxels =
       at::zeros({voxel_num, max_points, num_features}, points.options());
   at::Tensor voxel_coors =
       at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
-  AT_DISPATCH_ALL_TYPES(points.type(), "scatter_point_to_voxel", [&] {
+  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
     scatter_point_to_voxel_kernel<scalar_t, int>(
         points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
         point_to_voxelidx.accessor<int, 1>(),
         coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
         voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
   });

   at::Tensor num_points_per_voxel_out =
       num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
   return {voxels, voxel_coors, num_points_per_voxel_out};
 }

 }  // namespace voxelization
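The substantive change in this file is inside the AT_DISPATCH_ALL_TYPES calls: the macro's first argument is now tensor.scalar_type() instead of the deprecated tensor.type(). A self-contained sketch of the same dispatch pattern (the kernel and tensor names here are invented for illustration, not from the repo):

#include <torch/extension.h>

// Typed kernel instantiated once per dispatched scalar type.
template <typename scalar_t>
void fill_ones_kernel(torch::TensorAccessor<scalar_t, 1> out, int n) {
  for (int i = 0; i < n; ++i) out[i] = static_cast<scalar_t>(1);
}

void fill_ones(at::Tensor& out) {
  // scalar_type() replaces the deprecated type(); the macro binds scalar_t
  // to the tensor's element type inside the lambda.
  AT_DISPATCH_ALL_TYPES(out.scalar_type(), "fill_ones", [&] {
    fill_ones_kernel<scalar_t>(out.accessor<scalar_t, 1>(), out.size(0));
  });
}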
mmdet3d/ops/voxel/src/scatter_points_cuda.cu
View file @ ee5667c1
...
@@ -6,7 +6,7 @@
 #include <ATen/cuda/CUDAApplyUtils.cuh>

 #define CHECK_CUDA(x) \
-  TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
 #define CHECK_CONTIGUOUS(x) \
   TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_INPUT(x) \
...
@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
   dim3 threads(threadsPerBlock);
   cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
   AT_DISPATCH_ALL_TYPES(
-      voxel_mapping.type(), "determin_duplicate", ([&] {
+      voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
            voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
            point_to_pointidx.data_ptr<int>(), num_points, NDim);
...
@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
       voxel_mapping.options());  // must be zero from the begining
   cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
   AT_DISPATCH_ALL_TYPES(
-      voxel_mapping.type(), "determin_duplicate", ([&] {
+      voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
        determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
            voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(),
            point_to_voxelidx.data_ptr<int>(),
...
@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
   dim3 cp_threads(threadsPerBlock, 4);
   cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
   AT_DISPATCH_ALL_TYPES(
-      points.type(), "scatter_point_to_voxel", ([&] {
+      points.scalar_type(), "scatter_point_to_voxel", ([&] {
        scatter_point_to_voxel_kernel<float, int>
            <<<blocks, cp_threads, 0, cp_stream>>>(
                points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
...
@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
   dim3 blocks(col_blocks);
   dim3 cp_threads(threadsPerBlock, 4);
   cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_ALL_TYPES(grad_input_points.type(), "scatter_point_to_voxel",
-                        ([&] {
+  AT_DISPATCH_ALL_TYPES(grad_input_points.scalar_type(),
+                        "scatter_point_to_voxel", ([&] {
        map_voxel_to_point_kernel<float, int>
            <<<blocks, cp_threads, 0, cp_stream>>>(
                grad_input_points.data_ptr<float>(),
...
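The CHECK_CUDA change at the top of this file is the canonical fix for the deprecation warning: device residency is queried from Tensor::device() rather than the old type() properties. Restated as a compilable sketch; note the diff is collapsed before CHECK_INPUT's body, so the composition shown for it below is the conventional one and should be treated as an assumption:

#include <torch/extension.h>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Assumed composition; the diff cuts off before CHECK_INPUT's actual body.
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

void validate(const at::Tensor& points) {
  CHECK_INPUT(points);  // throws c10::Error with a clear message on violation
}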
mmdet3d/ops/voxel/src/voxelization.h
View file @ ee5667c1
...
@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
                          const std::vector<float> coors_range,
                          const int max_points, const int max_voxels,
                          const int NDim = 3) {
-  if (points.type().is_cuda()) {
+  if (points.device().is_cuda()) {
 #ifdef WITH_CUDA
     return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
                              voxel_size, coors_range, max_points, max_voxels,
...
@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
                              const std::vector<float> voxel_size,
                              const std::vector<float> coors_range,
                              const int NDim = 3) {
-  if (points.type().is_cuda()) {
+  if (points.device().is_cuda()) {
 #ifdef WITH_CUDA
     return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
 #else
...
@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
 inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
     const at::Tensor& points, const at::Tensor& voxel_mapping,
     const std::vector<float> voxel_size, const std::vector<float> coors_range) {
-  if (points.type().is_cuda()) {
+  if (points.device().is_cuda()) {
 #ifdef WITH_CUDA
     return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size,
                                               coors_range);
...
@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
 inline void dynamic_point_to_voxel_backward(
     at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
     const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
-  if (grad_input_points.type().is_cuda()) {
+  if (grad_input_points.device().is_cuda()) {
 #ifdef WITH_CUDA
     return dynamic_point_to_voxel_backward_gpu(
         grad_input_points, grad_output_voxels, point_to_voxelidx,
...
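Every hunk in this header follows the same CPU/CUDA dispatch shape: branch on the input tensor's device, call the GPU backend when compiled with WITH_CUDA, and fall back (or error) otherwise. A sketch of that pattern with a hypothetical op, using the post-commit device().is_cuda() spelling (my_op and its backends are placeholders, not functions from this repo):

#include <torch/extension.h>

at::Tensor my_op_cpu(const at::Tensor& points);  // assumed defined elsewhere
#ifdef WITH_CUDA
at::Tensor my_op_gpu(const at::Tensor& points);  // assumed defined elsewhere
#endif

inline at::Tensor my_op(const at::Tensor& points) {
  // Post-commit spelling; the deprecated form was points.type().is_cuda().
  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return my_op_gpu(points);
#else
    TORCH_CHECK(false, "Not compiled with GPU support");
#endif
  }
  return my_op_cpu(points);
}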
mmdet3d/ops/voxel/src/voxelization_cpu.cpp
View file @
ee5667c1
#include <torch/extension.h>
#include <ATen/TensorUtils.h>
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
// #include "voxelization.h"
namespace
{
namespace
{
template
<
typename
T
,
typename
T_int
>
template
<
typename
T
,
typename
T_int
>
void
dynamic_voxelize_kernel
(
const
torch
::
TensorAccessor
<
T
,
2
>
points
,
void
dynamic_voxelize_kernel
(
const
torch
::
TensorAccessor
<
T
,
2
>
points
,
torch
::
TensorAccessor
<
T_int
,
2
>
coors
,
torch
::
TensorAccessor
<
T_int
,
2
>
coors
,
const
std
::
vector
<
float
>
voxel_size
,
const
std
::
vector
<
float
>
voxel_size
,
const
std
::
vector
<
float
>
coors_range
,
const
std
::
vector
<
float
>
coors_range
,
const
std
::
vector
<
int
>
grid_size
,
const
std
::
vector
<
int
>
grid_size
,
const
int
num_points
,
const
int
num_points
,
const
int
num_features
,
const
int
num_features
,
const
int
NDim
)
{
const
int
NDim
)
{
const
int
ndim_minus_1
=
NDim
-
1
;
const
int
ndim_minus_1
=
NDim
-
1
;
bool
failed
=
false
;
bool
failed
=
false
;
int
coor
[
NDim
];
int
coor
[
NDim
];
...
@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
...
@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
return
;
return
;
}
}
template
<
typename
T
,
typename
T_int
>
template
<
typename
T
,
typename
T_int
>
void
hard_voxelize_kernel
(
const
torch
::
TensorAccessor
<
T
,
2
>
points
,
void
hard_voxelize_kernel
(
const
torch
::
TensorAccessor
<
T
,
2
>
points
,
torch
::
TensorAccessor
<
T
,
3
>
voxels
,
torch
::
TensorAccessor
<
T
,
3
>
voxels
,
torch
::
TensorAccessor
<
T_int
,
2
>
coors
,
torch
::
TensorAccessor
<
T_int
,
2
>
coors
,
torch
::
TensorAccessor
<
T_int
,
1
>
num_points_per_voxel
,
torch
::
TensorAccessor
<
T_int
,
1
>
num_points_per_voxel
,
torch
::
TensorAccessor
<
T_int
,
3
>
coor_to_voxelidx
,
torch
::
TensorAccessor
<
T_int
,
3
>
coor_to_voxelidx
,
int
&
voxel_num
,
int
&
voxel_num
,
const
std
::
vector
<
float
>
voxel_size
,
const
std
::
vector
<
float
>
voxel_size
,
const
std
::
vector
<
float
>
coors_range
,
const
std
::
vector
<
float
>
coors_range
,
const
std
::
vector
<
int
>
grid_size
,
const
std
::
vector
<
int
>
grid_size
,
const
int
max_points
,
const
int
max_points
,
const
int
max_voxels
,
const
int
max_voxels
,
const
int
num_points
,
const
int
num_features
,
const
int
num_points
,
const
int
NDim
)
{
const
int
num_features
,
const
int
NDim
)
{
// declare a temp coors
// declare a temp coors
at
::
Tensor
temp_coors
=
at
::
zeros
({
num_points
,
NDim
},
at
::
TensorOptions
().
dtype
(
at
::
kInt
).
device
(
at
::
kCPU
));
at
::
Tensor
temp_coors
=
at
::
zeros
(
{
num_points
,
NDim
},
at
::
TensorOptions
().
dtype
(
at
::
kInt
).
device
(
at
::
kCPU
));
// First use dynamic voxelization to get coors,
// First use dynamic voxelization to get coors,
// then check max points/voxels constraints
// then check max points/voxels constraints
dynamic_voxelize_kernel
<
T
,
int
>
(
dynamic_voxelize_kernel
<
T
,
int
>
(
points
,
temp_coors
.
accessor
<
int
,
2
>
(),
points
,
voxel_size
,
coors_range
,
grid_size
,
temp_coors
.
accessor
<
int
,
2
>
(),
num_points
,
num_features
,
NDim
);
voxel_size
,
coors_range
,
grid_size
,
num_points
,
num_features
,
NDim
);
int
voxelidx
,
num
;
int
voxelidx
,
num
;
auto
coor
=
temp_coors
.
accessor
<
int
,
2
>
();
auto
coor
=
temp_coors
.
accessor
<
int
,
2
>
();
for
(
int
i
=
0
;
i
<
num_points
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_points
;
++
i
)
{
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
// T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
if
(
coor
[
i
][
0
]
==
-
1
)
if
(
coor
[
i
][
0
]
==
-
1
)
continue
;
continue
;
voxelidx
=
coor_to_voxelidx
[
coor
[
i
][
0
]][
coor
[
i
][
1
]][
coor
[
i
][
2
]];
voxelidx
=
coor_to_voxelidx
[
coor
[
i
][
0
]][
coor
[
i
][
1
]][
coor
[
i
][
2
]];
// record voxel
// record voxel
if
(
voxelidx
==
-
1
)
{
if
(
voxelidx
==
-
1
)
{
voxelidx
=
voxel_num
;
voxelidx
=
voxel_num
;
if
(
max_voxels
!=
-
1
&&
voxel_num
>=
max_voxels
)
if
(
max_voxels
!=
-
1
&&
voxel_num
>=
max_voxels
)
break
;
break
;
voxel_num
+=
1
;
voxel_num
+=
1
;
coor_to_voxelidx
[
coor
[
i
][
0
]][
coor
[
i
][
1
]][
coor
[
i
][
2
]]
=
voxelidx
;
coor_to_voxelidx
[
coor
[
i
][
0
]][
coor
[
i
][
1
]][
coor
[
i
][
2
]]
=
voxelidx
;
...
@@ -118,19 +100,14 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
...
@@ -118,19 +100,14 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
}  // namespace

namespace voxelization {

int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3) {
  // current version tooks about 0.02s_0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...
@@ -140,43 +117,34 @@ int hard_voxelize_cpu(
  const int num_features = points.size(1);

  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
  // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
  // grid_size[1], grid_size[0]);
  at::Tensor coor_to_voxelidx =
      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());

  int voxel_num = 0;
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      points.scalar_type(), "hard_voxelize_forward", [&] {
        hard_voxelize_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
            coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
            coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
            coors_range, grid_size, max_points, max_voxels, num_points,
            num_features, NDim);
      });

  return voxel_num;
}
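
For orientation, a hedged usage sketch of hard_voxelize_cpu as defined above: the caller pre-allocates the fixed-size outputs, and the function fills them in place and returns how many voxels were actually used. The KITTI-style voxel_size/coors_range values and the {max_voxels, 3} shape of coors are assumptions; the declaration mirrors the definition.

#include <torch/torch.h>
#include <vector>

namespace voxelization {
int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3);
}

void voxelize_example() {
  const int max_points = 35, max_voxels = 20000, num_features = 4;
  at::Tensor points = at::rand({1000, num_features});  // CPU float cloud
  at::Tensor voxels = at::zeros({max_voxels, max_points, num_features});
  at::Tensor coors = at::zeros({max_voxels, 3}, at::kInt);
  at::Tensor num_points_per_voxel = at::zeros({max_voxels}, at::kInt);
  int voxel_num = voxelization::hard_voxelize_cpu(
      points, voxels, coors, num_points_per_voxel,
      {0.05f, 0.05f, 0.1f},                  // voxel_size (assumed values)
      {0.f, -40.f, -3.f, 70.4f, 40.f, 1.f},  // coors_range (assumed values)
      max_points, max_voxels);
  // only the first voxel_num entries of voxels/coors are meaningful
}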
void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
                          const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const int NDim = 3) {
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
...
@@ -185,24 +153,19 @@ void dynamic_voxelize_cpu(
  const int num_features = points.size(1);

  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      points.scalar_type(), "hard_voxelize_forward", [&] {
        dynamic_voxelize_kernel<scalar_t, int>(
            points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
            voxel_size, coors_range, grid_size, num_points, num_features,
            NDim);
      });

  return;
}

}  // namespace voxelization
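
dynamic_voxelize_kernel itself is elided from this diff, but its per-point arithmetic follows from the grid_size formula above: each coordinate is the floor of the offset from the range minimum divided by the voxel size, and a point outside the grid is marked with coordinate -1, which is exactly what the hard-voxelize loop checks for. A standalone sketch under assumed KITTI-style ranges (the sample point is illustrative):

#include <cmath>
#include <cstdio>

int main() {
  const float voxel_size[3] = {0.05f, 0.05f, 0.1f};
  const float coors_range[6] = {0.f, -40.f, -3.f, 70.4f, 40.f, 1.f};
  int grid_size[3];
  for (int i = 0; i < 3; ++i)
    grid_size[i] = static_cast<int>(
        std::round((coors_range[3 + i] - coors_range[i]) / voxel_size[i]));

  const float point[3] = {12.3f, -1.7f, 0.2f};
  int coor[3] = {-1, -1, -1};
  for (int i = 0; i < 3; ++i) {
    int c = static_cast<int>(
        std::floor((point[i] - coors_range[i]) / voxel_size[i]));
    if (c < 0 || c >= grid_size[i]) {
      coor[0] = -1;  // mark the whole point invalid, as the kernels expect
      break;
    }
    coor[i] = c;
  }
  printf("coor = (%d, %d, %d)\n", coor[0], coor[1], coor[2]);
}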
mmdet3d/ops/voxel/src/voxelization_cuda.cu
View file @ ee5667c1
...
@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>

#define CHECK_CUDA(x) \
-  TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
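
This one-token change is the likely source of the suppressed warning named in the commit message: Tensor::type() returns the deprecated DeprecatedTypeProperties object, so x.type().is_cuda() trips -Wdeprecated-declarations on recent PyTorch builds, while Tensor::device() answers the same question directly. A minimal sketch of the resulting check:

#include <torch/torch.h>

void check_inputs(const at::Tensor& x) {
  TORCH_CHECK(x.device().is_cuda(), "x must be a CUDA tensor");  // new form
  TORCH_CHECK(x.is_contiguous(), "x must be contiguous");
  // TORCH_CHECK(x.type().is_cuda(), ...) compiles too, but emits a warning.
}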
...
@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  // 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
-      points.type(), "hard_voxelize_kernel", ([&] {
+      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int>
            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                points.contiguous().data_ptr<scalar_t>(),
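
The same deprecation fix recurs through the rest of the file: scalar_type() yields the at::ScalarType that the AT_DISPATCH_* macros expect, without going through the deprecated type() accessor. A self-contained sketch of the dispatch pattern (the function and its name string are illustrative):

#include <ATen/ATen.h>

// Read the first element of a tensor of any dispatched dtype.
double first_element(const at::Tensor& t) {
  double out = 0.0;
  AT_DISPATCH_ALL_TYPES(t.scalar_type(), "first_element", ([&] {
                          // scalar_t is bound to the concrete C++ type here
                          out = static_cast<double>(
                              t.contiguous().data_ptr<scalar_t>()[0]);
                        }));
  return out;
}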
...
@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 map_block(512);
  AT_DISPATCH_ALL_TYPES(
-      temp_coors.type(), "determin_duplicate", ([&] {
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int>
            <<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                temp_coors.contiguous().data_ptr<int>(),
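
The launch arithmetic above is worth spelling out: ceil-divide the work items by the 512-thread block size and cap the grid at 4096 blocks, which presumes the kernel loops grid-stride style over any leftover items. As a sketch:

#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <algorithm>

// One thread per item, 512 threads per block, grid capped at 4096 blocks.
dim3 launch_grid(int num_items) {
  return dim3(std::min(at::cuda::ATenCeilDiv(num_items, 512), 4096));
}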
...
@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
      points.options().dtype(at::kInt));  // must be zero from the begining
  AT_DISPATCH_ALL_TYPES(
-      temp_coors.type(), "determin_duplicate", ([&] {
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
            num_points_per_voxel.contiguous().data_ptr<int>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
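
Note that determin_voxel_num is launched with a single thread (<<<1, 1, ...>>>): it reduces the per-point bookkeeping to one voxel count, trivial work next to the surrounding copies. A host-side analogue, assuming point_to_voxelidx stores each point's index within its voxel (0 for the first point, so counting zeros counts voxels):

int count_voxels(const int* point_to_voxelidx, int num_points) {
  int voxel_num = 0;
  for (int i = 0; i < num_points; ++i)
    if (point_to_voxelidx[i] == 0) ++voxel_num;  // first point of its voxel
  return voxel_num;
}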
...
@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
  dim3 cp_block(512);
  AT_DISPATCH_ALL_TYPES(
-      points.type(), "assign_point_to_voxel", ([&] {
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_point_to_voxel<float, int>
            <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                pts_output_size, points.contiguous().data_ptr<float>(),
...
@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
      std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
  dim3 coors_cp_block(512);
  AT_DISPATCH_ALL_TYPES(
-      points.type(), "assign_point_to_voxel", ([&] {
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
                                         at::cuda::getCurrentCUDAStream()>>>(
            coors_output_size, temp_coors.contiguous().data_ptr<int>(),
...
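
The final two hunks touch the output-copy stage: assign_point_to_voxel and assign_voxel_coors are flat one-thread-per-element copies from the temporary per-point buffers into the compacted voxels/coors outputs. A host-side analogue of the index math (the src_point mapping table is hypothetical, standing in for the bookkeeping the real kernels consume):

void gather_features(int n, const float* points, const int* src_point,
                     int num_features, float* voxels) {
  for (int i = 0; i < n; ++i) {           // one CUDA thread per i in the kernel
    int f = i % num_features;             // feature index within the point
    int p = src_point[i / num_features];  // source point for this output slot
    voxels[i] = (p >= 0) ? points[p * num_features + f] : 0.f;
  }
}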