OpenDAS / mmdetection3d

Commit ee5667c1, authored Apr 27, 2020 by zhangwenwei
Parent: 2bb43004

    Reformat cpp code to suppress warning
Showing 15 changed files with 884 additions and 815 deletions (+884 -815)
mmdet3d/datasets/kitti_dataset.py                          +2    -2
mmdet3d/models/anchor_heads/second_head.py                 +22   -2
mmdet3d/ops/iou3d/src/iou3d.cpp                            +145  -127
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp    +56   -54
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu    +102  -90
mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp        +123  -100
mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h       +34   -27
mmdet3d/ops/spconv/include/spconv/pool_ops.h               +9    -8
mmdet3d/ops/spconv/include/spconv/spconv_ops.h             +162  -131
mmdet3d/ops/spconv/include/torch_utils.h                   +34   -33
mmdet3d/ops/voxel/src/scatter_points_cpu.cpp               +101  -110
mmdet3d/ops/voxel/src/scatter_points_cuda.cu               +6    -6
mmdet3d/ops/voxel/src/voxelization.h                       +4    -4
mmdet3d/ops/voxel/src/voxelization_cpu.cpp                 +78   -115
mmdet3d/ops/voxel/src/voxelization_cuda.cu                 +6    -6
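
Every C++/CUDA file below applies the same two mechanical fixes that silence deprecation warnings on newer PyTorch: AT_CHECK(x.type().is_cuda(), ...) becomes TORCH_CHECK(x.device().is_cuda(), ...), and Tensor::data<T>() becomes Tensor::data_ptr<T>(), with clang-format rewrapping on top. A minimal before/after sketch of the pattern (the check_and_read helper is hypothetical, written here only to illustrate the migration):

#include <torch/extension.h>

// Old style (emits deprecation warnings on recent PyTorch):
//   #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
//   const float *p = t.data<float>();

// New style, as applied throughout this commit:
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")

// Hypothetical helper showing the accessor migration.
const float *check_and_read(const at::Tensor &t) {
  CHECK_CUDA(t);
  CHECK_CONTIGUOUS(t);
  return t.data_ptr<float>();  // was: t.data<float>()
}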
mmdet3d/datasets/kitti_dataset.py

@@ -274,7 +274,7 @@ class KittiDataset(torch_data.Dataset):
                 out)
         return result_files

-    def evaluate(self, result_files, eval_types=None):
+    def evaluate(self, result_files, logger=None, eval_types=None):
         from mmdet3d.core.evaluation import kitti_eval
         gt_annos = [info['annos'] for info in self.kitti_infos]
         if eval_types == 'img_bbox':
@@ -283,7 +283,7 @@ class KittiDataset(torch_data.Dataset):
         else:
             ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
                                                 self.class_names)
-        return ap_result_str, ap_dict
+        return ap_dict

     def bbox2result_kitti(self, net_outputs, class_names, out=None):
         if out:
mmdet3d/models/anchor_heads/second_head.py

@@ -15,12 +15,32 @@ from .train_mixins import AnchorTrainMixin

 @HEADS.register_module
 class SECONDHead(nn.Module, AnchorTrainMixin):
-    """Anchor-based head (RPN, RetinaNet, SSD, etc.).
+    """Anchor-based head for VoxelNet detectors.

     Args:
+        class_name (list[str]): name of classes (TODO: to be removed)
         in_channels (int): Number of channels in the input feature map.
+        train_cfg (dict): train configs
+        test_cfg (dict): test configs
         feat_channels (int): Number of channels of the feature map.
+        use_direction_classifier (bool): Whether to add a direction classifier.
+        encode_bg_as_zeros (bool): Whether to use sigmoid or softmax
+            (TODO: to be removed)
+        box_code_size (int): The size of box code.
+        anchor_generator (dict): Config dict of anchor generator.
+        assigner_per_size (bool): Whether to do assignment for each separate
+            anchor size.
+        assign_per_class (bool): Whether to do assignment for each class.
+        diff_rad_by_sin (bool): Whether to change the difference into sin
+            difference for box regression loss.
+        dir_offset (float | int): The offset of BEV rotation angles
+            (TODO: may be moved into box coder)
+        dir_limit_offset (float | int): The limited range of BEV rotation
+            angles (TODO: may be moved into box coder)
+        box_coder (dict): Config dict of box coders.
         loss_cls (dict): Config of classification loss.
         loss_bbox (dict): Config of localization loss.
+        loss_dir (dict): Config of direction classifier loss.
     """  # noqa: W605

     def __init__(self,
@@ -253,7 +273,7 @@ class SECONDHead(nn.Module, AnchorTrainMixin):
         num_levels = len(cls_scores)
         featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
         device = cls_scores[0].device
-        mlvl_anchors = self.anchor_generators.grid_anchors(
+        mlvl_anchors = self.anchor_generator.grid_anchors(
             featmap_sizes, device=device)
         mlvl_anchors = [
             anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors
mmdet3d/ops/iou3d/src/iou3d.cpp

The changes in this file are mechanical: includes sorted, AT_CHECK replaced
by TORCH_CHECK (with x.type().is_cuda() becoming x.device().is_cuda()), every
.data<T>() replaced by .data_ptr<T>(), and clang-format rewrapping. The file
after the change:

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <vector>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a,
                         const int num_b, const float *boxes_b,
                         float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);

int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_overlap);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_overlap_data = ans_overlap.data_ptr<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}

int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_iou);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_iou_data = ans_iou.data_ptr<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}

int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long remv_cpu[col_blocks];
  memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  long *keep_data = keep.data_ptr<long>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long remv_cpu[col_blocks];
  memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
}
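
For reference, the host-side loop in nms_gpu decodes a boxes_num x col_blocks bitmask: row i holds one 64-bit word per block of 64 candidate boxes, and bit (i % 64) of word (i / 64) marks suppression. A standalone sketch of that indexing arithmetic (illustration only, using the same DIVUP definition):

#include <cstdio>

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

int main() {
  const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;  // 64
  const int boxes_num = 1000;
  // One 64-bit word per (box row, 64-box column block) pair.
  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);  // 16
  std::printf("mask: %d x %d words = %d bytes\n", boxes_num, col_blocks,
              boxes_num * col_blocks * (int)sizeof(unsigned long long));
  // Box i is tracked in word i / 64 at bit i % 64 of every row.
  const int i = 70;
  std::printf("box %d -> word %d, bit %d\n", i, i / THREADS_PER_BLOCK_NMS,
              i % THREADS_PER_BLOCK_NMS);
  return 0;
}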
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp

Changes: AT_CHECK becomes TORCH_CHECK, .data<T>() becomes .data_ptr<T>(), and
clang-format rewrapping. The file after the change:

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes cpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")

// #define DEBUG

inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}

inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center

  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor) {
  // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the
  // bottom center, each box DO NOT overlaps
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_indices: (N, npoints)

  CHECK_CONTIGUOUS(boxes_tensor);
  CHECK_CONTIGUOUS(pts_tensor);
  CHECK_CONTIGUOUS(pts_indices_tensor);

  int boxes_num = boxes_tensor.size(0);
  int pts_num = pts_tensor.size(0);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *pts_indices = pts_indices_tensor.data_ptr<int>();

  float local_x = 0, local_y = 0;
  for (int i = 0; i < boxes_num; i++) {
    for (int j = 0; j < pts_num; j++) {
      int cur_in_flag =
          check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
      pts_indices[i * pts_num + j] = cur_in_flag;
    }
  }

  return 1;
}
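
The point-in-box test first rejects on height, then rotates the xy-offset by rz + pi/2 into the box frame and compares against the half-extents (l along local x, w along local y). A small sanity check, assuming it is compiled in the same translation unit as the helpers above (the main function is hypothetical):

#include <cassert>

int main() {
  // box: center (1, 2), bottom z = 0, w = 2, l = 4, h = 2, yaw rz = 0.5
  float box[7] = {1.f, 2.f, 0.f, 2.f, 4.f, 2.f, 0.5f};
  float center[3] = {1.f, 2.f, 1.f};  // z = bottom + h / 2
  float far_pt[3] = {50.f, 50.f, 1.f};
  float lx = 0.f, ly = 0.f;
  assert(check_pt_in_box3d_cpu(center, box, lx, ly) == 1);  // inside
  assert(check_pt_in_box3d_cpu(far_pt, box, lx, ly) == 0);  // outside
  return 0;
}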
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu

Changes: AT_CHECK becomes TORCH_CHECK, .data<T>() becomes .data_ptr<T>(), and
clang-format rewrapping. The file after the change:

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Points in boxes gpu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

// #define DEBUG

__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
  // should rotate pi/2 + alpha to translate LiDAR to local
  float rot_angle = rz + M_PI / 2;
  float cosa = cos(rot_angle), sina = sin(rot_angle);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}

__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6];
  cz += h / 2.0;  // shift to the center since cz in box3d is the bottom center

  if (fabsf(z - cz) > h / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) &
                  (local_y > -w / 2.0) & (local_y < w / 2.0);
  return in_flag;
}

__global__ void points_in_boxes_kernel(int batch_size, int boxes_num,
                                       int pts_num, const float *boxes,
                                       const float *pts,
                                       int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1

  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= batch_size || pt_idx >= pts_num) return;

  boxes += bs_idx * boxes_num * 7;
  pts += bs_idx * pts_num * 3 + pt_idx * 3;
  box_idx_of_points += bs_idx * pts_num + pt_idx;

  float local_x = 0, local_y = 0;
  int cur_in_flag = 0;
  for (int k = 0; k < boxes_num; k++) {
    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
    if (cur_in_flag) {
      box_idx_of_points[0] = k;
      break;
    }
  }
}

void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num,
                              const float *boxes, const float *pts,
                              int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1

  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
  points_in_boxes_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
                                              boxes, pts, box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }

#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
  // the bottom center, each box DO NOT overlaps
  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
  // params boxes_idx_of_points: (B, npoints), default -1

  CHECK_INPUT(boxes_tensor);
  CHECK_INPUT(pts_tensor);
  CHECK_INPUT(box_idx_of_points_tensor);

  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

  points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts,
                           box_idx_of_points);

  return 1;
}
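
The CUDA version parallelizes the same test with one thread per point and one grid row per batch sample. A host-side sketch of the launch geometry the launcher computes (illustration only):

#include <cstdio>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

int main() {
  const int batch_size = 4, pts_num = 20000;
  // grid.x tiles the points in 256-thread blocks; grid.y indexes the batch.
  const int grid_x = DIVUP(pts_num, THREADS_PER_BLOCK);  // 79
  std::printf("grid = (%d, %d), block = %d threads\n", grid_x, batch_size,
              THREADS_PER_BLOCK);
  // Inside the kernel: pt_idx = blockIdx.x * blockDim.x + threadIdx.x,
  // bs_idx = blockIdx.y; threads past pts_num return immediately.
  return 0;
}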
mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp

Changes: AT_CHECK becomes TORCH_CHECK, .data<T>() becomes .data_ptr<T>(), and
clang-format rewrapping. The file after the change:

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// RoI-aware point cloud feature pooling
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              int max_pts_each_voxel, int out_x, int out_y,
                              int out_z, const float *rois, const float *pts,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method);

void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
                                       int out_z, int channels,
                                       int max_pts_each_voxel,
                                       const int *pts_idx_of_voxels,
                                       const int *argmax, const float *grad_out,
                                       float *grad_in, int pool_method);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method);

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method);

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor);

int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params pool_method: 0: max_pool 1: avg_pool

  CHECK_INPUT(rois);
  CHECK_INPUT(pts);
  CHECK_INPUT(pts_feature);
  CHECK_INPUT(argmax);
  CHECK_INPUT(pts_idx_of_voxels);
  CHECK_INPUT(pooled_features);

  int boxes_num = rois.size(0);
  int pts_num = pts.size(0);
  int channels = pts_feature.size(1);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  assert((out_x < 256) && (out_y < 256) &&
         (out_z < 256));  // we encode index with 8bit

  const float *rois_data = rois.data_ptr<float>();
  const float *pts_data = pts.data_ptr<float>();
  const float *pts_feature_data = pts_feature.data_ptr<float>();
  int *argmax_data = argmax.data_ptr<int>();
  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  float *pooled_features_data = pooled_features.data_ptr<float>();

  roiaware_pool3d_launcher(boxes_num, pts_num, channels, max_pts_each_voxel,
                           out_x, out_y, out_z, rois_data, pts_data,
                           pts_feature_data, argmax_data,
                           pts_idx_of_voxels_data, pooled_features_data,
                           pool_method);

  return 1;
}

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // params pool_method: 0: max_pool 1: avg_pool

  CHECK_INPUT(pts_idx_of_voxels);
  CHECK_INPUT(argmax);
  CHECK_INPUT(grad_out);
  CHECK_INPUT(grad_in);

  int boxes_num = pts_idx_of_voxels.size(0);
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int channels = grad_out.size(4);

  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  const int *argmax_data = argmax.data_ptr<int>();
  const float *grad_out_data = grad_out.data_ptr<float>();
  float *grad_in_data = grad_in.data_ptr<float>();

  roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
                                    max_pts_each_voxel, pts_idx_of_voxels_data,
                                    argmax_data, grad_out_data, grad_in_data,
                                    pool_method);

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
  m.def("backward", &roiaware_pool3d_gpu_backward,
        "roiaware pool3d backward (CUDA)");
  m.def("points_in_boxes_gpu", &points_in_boxes_gpu,
        "points_in_boxes_gpu forward (CUDA)");
  m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
        "points_in_boxes_cpu forward (CPU)");
}
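
The forward binding fills caller-allocated outputs in place, so the caller must create tensors with the shapes documented in the comments. A hedged caller sketch (assumes the extension above is built and linked and a CUDA device is available; the sizes are arbitrary):

#include <torch/extension.h>

// Forward declaration matching the definition above.
int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature,
                        at::Tensor argmax, at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method);

void example_forward() {
  const int N = 8, npoints = 4096, C = 16;
  const int out_x = 12, out_y = 12, out_z = 12, max_pts_each_voxel = 128;
  auto f32 = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  auto i32 = torch::dtype(torch::kInt32).device(torch::kCUDA);

  auto rois = torch::zeros({N, 7}, f32);       // [x, y, z, w, l, h, ry]
  auto pts = torch::zeros({npoints, 3}, f32);  // LiDAR-frame points
  auto pts_feature = torch::zeros({npoints, C}, f32);
  // Outputs are allocated by the caller and filled in place.
  auto argmax = torch::zeros({N, out_x, out_y, out_z, C}, i32);
  auto pts_idx =
      torch::zeros({N, out_x, out_y, out_z, max_pts_each_voxel}, i32);
  auto pooled = torch::zeros({N, out_x, out_y, out_z, C}, f32);

  roiaware_pool3d_gpu(rois, pts, pts_feature, argmax, pts_idx, pooled,
                      /*pool_method=*/0);  // 0: max_pool, 1: avg_pool
}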
mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h

Beyond .data<T>() becoming .data_ptr<T>(), the changes here are clang-format
rewrapping. The affected hunks after the edit:

@@ -26,9 +26,10 @@ namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();

@@ -37,13 +38,16 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  /*if (_subM){
    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
    indicePairNumCpu.data_ptr<int>() + kernelVolume);
    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
    auto indicePairVecMaxSizeIter = std::max_element(

@@ -56,46 +60,49 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  // auto indicePairOptions =
  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {  // the center index of subm conv don't need gather and scatter
    // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    auto outputBufferBlob =
        torch::from_blob(outputBuffer.data_ptr<T>(),
                         {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data_ptr<T>(),
                         {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
      auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
      features, 0, indicePairBlob);*/
    }
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

@@ -105,14 +112,14 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
    // totalSAddTime += timer.report() / 1000.0;

@@ -122,6 +129,6 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
  return output;
}
}  // namespace spconv
#endif
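
The loop in fusedIndiceConvBatchNorm is a gather-GEMM-scatter pipeline: for each kernel offset it gathers the nHot active input rows, multiplies by that offset's filter slice, and scatter-adds into the output rows. A dense libtorch sketch of the same dataflow (illustration only; the real code uses the custom SparseGather/SparseScatterAdd functors precisely to avoid the int-to-long index conversion noted in the commented-out block):

#include <torch/torch.h>

// features: (numActIn, Cin); filter: (Cin, Cout); output: (numActOut, Cout).
// inIdx / outIdx are int64 index tensors selecting the nHot active input and
// output rows for one kernel offset (the role of indicePairs.subview(i, ...)).
void gather_gemm_scatter(const torch::Tensor &features,
                         const torch::Tensor &filter, torch::Tensor &output,
                         const torch::Tensor &inIdx,
                         const torch::Tensor &outIdx) {
  auto inputBuffer = features.index_select(0, inIdx);  // gather active rows
  auto outputBuffer = torch::mm(inputBuffer, filter);  // per-offset GEMM
  output.index_add_(0, outIdx, outputBuffer);          // scatter-add results
}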
mmdet3d/ops/spconv/include/spconv/pool_ops.h

Changes: indicePairNumCpu.data<int>() becomes .data_ptr<int>() and the
indiceMaxPoolBackward signature is rewrapped. The hunks after the edit:

@@ -24,7 +24,7 @@
namespace spconv {
template <typename T>
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
                            torch::Tensor indiceNum, int64_t numAct) {
  auto device = features.device().type();
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);

@@ -34,7 +34,7 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
  double totalTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }

@@ -59,18 +59,19 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum) {
  auto device = features.device().type();
  auto numInPlanes = features.size(1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }

@@ -92,6 +93,6 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
  return inputGrad;
}
}  // namespace spconv
#endif
mmdet3d/ops/spconv/include/spconv/spconv_ops.h

(diff collapsed in the page view; contents not captured)
mmdet3d/ops/spconv/include/torch_utils.h
View file @ ee5667c1
@@ -13,48 +13,49 @@
// limitations under the License.
#pragma once
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
+#include <tensorview/tensorview.h>
+#include <torch/script.h>

namespace tv {
struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
};

template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
  switch (tensor.type().scalarType()) {
    case at::ScalarType::Double: {
      auto val = std::is_same<std::remove_const_t<T>, double>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Float: {
      auto val = std::is_same<std::remove_const_t<T>, float>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Int: {
      auto val = std::is_same<std::remove_const_t<T>, int>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Half: {
      auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Long: {
      auto val = std::is_same<std::remove_const_t<T>, long>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    default:
      TV_ASSERT_RT_ERR(false, "error");
  }
}
...
@@ -65,6 +66,6 @@ tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
-  return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape);
+  return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
}
}  // namespace tv
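The `check_torch_dtype` helper guards the `torch2tv` conversion: it compares the tensor's runtime `ScalarType` against the compile-time template parameter with `std::is_same` before the raw pointer is handed to a `tv::TensorView`. A self-contained sketch of the same check, assuming only libtorch (plain `TORCH_CHECK` in place of `TV_ASSERT_RT_ERR`, no tensorview dependency):

#include <torch/torch.h>
#include <type_traits>

// Sketch: verify at runtime that `tensor` stores exactly the compile-time
// element type T before any reinterpretation of its storage.
template <typename T>
void expect_dtype(const torch::Tensor& tensor) {
  switch (tensor.scalar_type()) {
    case at::ScalarType::Float:
      TORCH_CHECK((std::is_same<std::remove_const_t<T>, float>::value),
                  "expected a float tensor");
      break;
    case at::ScalarType::Int:
      TORCH_CHECK((std::is_same<std::remove_const_t<T>, int>::value),
                  "expected an int tensor");
      break;
    default:
      TORCH_CHECK(false, "unsupported dtype");
  }
}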
mmdet3d/ops/voxel/src/scatter_points_cpu.cpp
View file @ ee5667c1
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"

namespace {

template <typename T_int>
void determin_max_points_kernel(
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 1> num_points_per_voxel,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
    int& max_points, const int num_points) {
  int voxelidx, num;
  for (int i = 0; i < num_points; ++i) {
    if (coor[i][0] == -1) continue;
    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    // record voxel
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
    }
    // put points into voxel
    num = num_points_per_voxel[voxelidx];
    point_to_voxelidx[i] = num;
    num_points_per_voxel[voxelidx] += 1;
    // update max points per voxel
    max_points = std::max(max_points, num + 1);
  }
  return;
}

template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
    const torch::TensorAccessor<T, 2> points,
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
    torch::TensorAccessor<T, 3> voxels,
    torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
    const int num_points, const int NDim) {
  for (int i = 0; i < num_points; ++i) {
    int num = point_to_voxelidx[i];
    int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    for (int k = 0; k < num_features; ++k) {
      voxels[voxelidx][num][k] = points[i][k];
    }
    for (int k = 0; k < NDim; ++k) {
      voxel_coors[voxelidx][k] = coor[i][k];
    }
  }
}

}  // namespace

namespace voxelization {

std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
    const at::Tensor& points, const at::Tensor& voxel_mapping,
    const std::vector<float> voxel_size,
    const std::vector<float> coors_range) {
  // current version tooks about 0.02s_0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  const int NDim = voxel_mapping.size(1);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  std::vector<int> grid_size(NDim);
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  at::Tensor num_points_per_voxel =
      at::zeros({num_points}, voxel_mapping.options());
  at::Tensor coor_to_voxelidx = -at::ones(
      {grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
  at::Tensor point_to_voxelidx =
      -at::ones({num_points}, voxel_mapping.options());

  int voxel_num = 0;
  int max_points = 0;
-  AT_DISPATCH_ALL_TYPES(voxel_mapping.type(), "determin_max_point", [&] {
+  AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
    determin_max_points_kernel<scalar_t>(
        voxel_mapping.accessor<scalar_t, 2>(),
        point_to_voxelidx.accessor<scalar_t, 1>(),
        num_points_per_voxel.accessor<scalar_t, 1>(),
        coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
        num_points);
  });

  at::Tensor voxels =
      at::zeros({voxel_num, max_points, num_features}, points.options());
  at::Tensor voxel_coors =
      at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));

-  AT_DISPATCH_ALL_TYPES(points.type(), "scatter_point_to_voxel", [&] {
+  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
    scatter_point_to_voxel_kernel<scalar_t, int>(
        points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
        point_to_voxelidx.accessor<int, 1>(),
        coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
        voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
  });

  at::Tensor num_points_per_voxel_out =
      num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
  return {voxels, voxel_coors, num_points_per_voxel_out};
}

}  // namespace voxelization
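`dynamic_point_to_voxel_cpu` works in two passes: `determin_max_points_kernel` walks the points once to give each occupied grid cell a dense voxel index (via `coor_to_voxelidx`) and to count points per voxel, and `scatter_point_to_voxel_kernel` then copies point features into the `[voxel_num, max_points, num_features]` output. The grid extent per axis is `round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i])`. A dependency-free sketch of the same two-pass idea on plain containers (hypothetical types, not the kernels above):

#include <array>
#include <map>
#include <vector>

// Sketch of the two-pass scatter. Pass 1 assigns each distinct integer
// coordinate a dense voxel id and counts its points; pass 2 scatters the
// per-point features into the per-voxel buckets.
using Coor = std::array<int, 3>;

void scatter(const std::vector<Coor>& coors,
             const std::vector<float>& feats,  // one feature per point
             std::vector<std::vector<float>>& voxels,
             std::vector<int>& counts) {
  std::map<Coor, int> coor_to_voxelidx;
  for (const Coor& c : coors) {  // pass 1: index + count
    auto it = coor_to_voxelidx.find(c);
    if (it == coor_to_voxelidx.end()) {
      it = coor_to_voxelidx.emplace(c, static_cast<int>(voxels.size())).first;
      voxels.emplace_back();
      counts.push_back(0);
    }
    counts[it->second] += 1;
  }
  for (size_t i = 0; i < coors.size(); ++i) {  // pass 2: scatter features
    voxels[coor_to_voxelidx[coors[i]]].push_back(feats[i]);
  }
}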
mmdet3d/ops/voxel/src/scatter_points_cuda.cu
View file @ ee5667c1
@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
-  TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
...
@@ -177,7 +177,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
  dim3 threads(threadsPerBlock);
  cudaStream_t map_stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(
-      voxel_mapping.type(), "determin_duplicate", ([&] {
+      voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int><<<blocks, threads, 0, map_stream>>>(
            voxel_mapping.data_ptr<int>(), point_to_voxelidx.data_ptr<int>(),
            point_to_pointidx.data_ptr<int>(), num_points, NDim);
...
@@ -203,7 +203,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
      voxel_mapping.options());  // must be zero from the begining
  cudaStream_t logic_stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(
-      voxel_mapping.type(), "determin_duplicate", ([&] {
+      voxel_mapping.scalar_type(), "determin_duplicate", ([&] {
        determin_voxel_num<int><<<1, 1, 0, logic_stream>>>(
            voxel_mapping.data_ptr<int>(), num_points_per_voxel.data_ptr<int>(),
            point_to_voxelidx.data_ptr<int>(),
...
@@ -228,7 +228,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
  dim3 cp_threads(threadsPerBlock, 4);
  cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_ALL_TYPES(
-      points.type(), "scatter_point_to_voxel", ([&] {
+      points.scalar_type(), "scatter_point_to_voxel", ([&] {
        scatter_point_to_voxel_kernel<float, int>
            <<<blocks, cp_threads, 0, cp_stream>>>(
                points.data_ptr<float>(), voxel_mapping.data_ptr<int>(),
...
@@ -265,8 +265,8 @@ void dynamic_point_to_voxel_backward_gpu(at::Tensor& grad_input_points,
  dim3 blocks(col_blocks);
  dim3 cp_threads(threadsPerBlock, 4);
  cudaStream_t cp_stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_ALL_TYPES(grad_input_points.type(), "scatter_point_to_voxel",
-                        ([&] {
+  AT_DISPATCH_ALL_TYPES(grad_input_points.scalar_type(),
+                        "scatter_point_to_voxel", ([&] {
        map_voxel_to_point_kernel<float, int>
            <<<blocks, cp_threads, 0, cp_stream>>>(
                grad_input_points.data_ptr<float>(),
...
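The `CHECK_CUDA` fix matters because `Tensor::type()` is deprecated in newer PyTorch; the device query belongs on `Tensor::device()`. The usual extension-side guard composes the two checks. A sketch of the common pattern follows; the `CHECK_INPUT` body is an assumption, since this diff truncates it:

#include <torch/extension.h>

// Common input-validation pattern for CUDA extensions: the tensor must live
// on a CUDA device and be contiguous before its raw pointer reaches a kernel.
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)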
mmdet3d/ops/voxel/src/voxelization.h
View file @ ee5667c1
@@ -49,7 +49,7 @@ inline int hard_voxelize(const at::Tensor& points, at::Tensor& voxels,
                         const std::vector<float> coors_range,
                         const int max_points, const int max_voxels,
                         const int NDim = 3) {
-  if (points.type().is_cuda()) {
+  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
                             voxel_size, coors_range, max_points, max_voxels,
...
@@ -67,7 +67,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
                             const std::vector<float> voxel_size,
                             const std::vector<float> coors_range,
                             const int NDim = 3) {
-  if (points.type().is_cuda()) {
+  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else
...
@@ -80,7 +80,7 @@ inline void dynamic_voxelize(const at::Tensor& points, at::Tensor& coors,
inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
    const at::Tensor& points, const at::Tensor& voxel_mapping,
    const std::vector<float> voxel_size,
    const std::vector<float> coors_range) {
-  if (points.type().is_cuda()) {
+  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return dynamic_point_to_voxel_forward_gpu(points, voxel_mapping, voxel_size,
                                              coors_range);
...
@@ -95,7 +95,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
inline void dynamic_point_to_voxel_backward(
    at::Tensor& grad_input_points, const at::Tensor& grad_output_voxels,
    const at::Tensor& point_to_voxelidx, const at::Tensor& coor_to_voxelidx) {
-  if (grad_input_points.type().is_cuda()) {
+  if (grad_input_points.device().is_cuda()) {
#ifdef WITH_CUDA
    return dynamic_point_to_voxel_backward_gpu(
        grad_input_points, grad_output_voxels, point_to_voxelidx,
...
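Each wrapper in this header follows the same shape: branch at runtime on `points.device().is_cuda()` and guard the GPU path with the compile-time `WITH_CUDA` flag so that CPU-only builds still link. A condensed sketch of that dispatch skeleton (function names hypothetical):

#include <torch/extension.h>

int process_cpu(const at::Tensor& points);  // hypothetical backends
#ifdef WITH_CUDA
int process_gpu(const at::Tensor& points);
#endif

// Dispatch skeleton: runtime device check plus compile-time guard, with a
// hard error when the GPU path is requested but CUDA was not compiled in.
inline int process(const at::Tensor& points) {
  if (points.device().is_cuda()) {
#ifdef WITH_CUDA
    return process_gpu(points);
#else
    TORCH_CHECK(false, "Not compiled with GPU support");
#endif
  }
  return process_cpu(points);
}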
mmdet3d/ops/voxel/src/voxelization_cpu.cpp
View file @ ee5667c1
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"

namespace {

template <typename T, typename T_int>
void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
                             torch::TensorAccessor<T_int, 2> coors,
                             const std::vector<float> voxel_size,
                             const std::vector<float> coors_range,
                             const std::vector<int> grid_size,
                             const int num_points, const int num_features,
                             const int NDim) {
  const int ndim_minus_1 = NDim - 1;
  bool failed = false;
  int coor[NDim];
...
@@ -44,56 +40,42 @@ void dynamic_voxelize_kernel(const torch::TensorAccessor<T,2> points,
  return;
}

template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
                          torch::TensorAccessor<T, 3> voxels,
                          torch::TensorAccessor<T_int, 2> coors,
                          torch::TensorAccessor<T_int, 1> num_points_per_voxel,
                          torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
                          int& voxel_num, const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const std::vector<int> grid_size,
                          const int max_points, const int max_voxels,
                          const int num_points, const int num_features,
                          const int NDim) {
  // declare a temp coors
  at::Tensor temp_coors = at::zeros(
      {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));

  // First use dynamic voxelization to get coors,
  // then check max points/voxels constraints
  dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
                                  voxel_size, coors_range, grid_size,
                                  num_points, num_features, NDim);

  int voxelidx, num;
  auto coor = temp_coors.accessor<int, 2>();

  for (int i = 0; i < num_points; ++i) {
    // T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
    if (coor[i][0] == -1) continue;

    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];

    // record voxel
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (max_voxels != -1 && voxel_num >= max_voxels) break;
      voxel_num += 1;

      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
...
@@ -116,93 +98,74 @@ void hard_voxelize_kernel(const torch::TensorAccessor<T,2> points,
  return;
}

}  // namespace

namespace voxelization {

int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
                      const std::vector<float> voxel_size,
                      const std::vector<float> coors_range,
                      const int max_points, const int max_voxels,
                      const int NDim = 3) {
  // current version tooks about 0.02s_0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
  // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
  // grid_size[1], grid_size[0]);
  at::Tensor coor_to_voxelidx =
      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());

  int voxel_num = 0;
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.scalar_type(), "hard_voxelize_forward", [&] {
    hard_voxelize_kernel<scalar_t, int>(
        points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
        coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
        coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
        coors_range, grid_size, max_points, max_voxels, num_points,
        num_features, NDim);
  });

  return voxel_num;
}

void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
                          const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const int NDim = 3) {
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  std::vector<int> grid_size(NDim);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.type(), "hard_voxelize_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(points.scalar_type(), "hard_voxelize_forward", [&] {
    dynamic_voxelize_kernel<scalar_t, int>(
        points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(), voxel_size,
        coors_range, grid_size, num_points, num_features, NDim);
  });

  return;
}

}  // namespace voxelization
mmdet3d/ops/voxel/src/voxelization_cuda.cu
View file @ ee5667c1
@@ -6,7 +6,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#define CHECK_CUDA(x) \
-  TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
...
@@ -219,7 +219,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  // 1. link point to corresponding voxel coors
  AT_DISPATCH_ALL_TYPES(
-      points.type(), "hard_voxelize_kernel", ([&] {
+      points.scalar_type(), "hard_voxelize_kernel", ([&] {
        dynamic_voxelize_kernel<scalar_t, int>
            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                points.contiguous().data_ptr<scalar_t>(),
...
@@ -247,7 +247,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
  dim3 map_block(512);
  AT_DISPATCH_ALL_TYPES(
-      temp_coors.type(), "determin_duplicate", ([&] {
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        point_to_voxelidx_kernel<int>
            <<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                temp_coors.contiguous().data_ptr<int>(),
...
@@ -272,7 +272,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
      points.options().dtype(at::kInt));  // must be zero from the begining
  AT_DISPATCH_ALL_TYPES(
-      temp_coors.type(), "determin_duplicate", ([&] {
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
        determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
            num_points_per_voxel.contiguous().data_ptr<int>(),
            point_to_voxelidx.contiguous().data_ptr<int>(),
...
@@ -290,7 +290,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
  dim3 cp_block(512);
  AT_DISPATCH_ALL_TYPES(
-      points.type(), "assign_point_to_voxel", ([&] {
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_point_to_voxel<float, int>
            <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
                pts_output_size, points.contiguous().data_ptr<float>(),
...
@@ -308,7 +308,7 @@ int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
      std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
  dim3 coors_cp_block(512);
  AT_DISPATCH_ALL_TYPES(
-      points.type(), "assign_point_to_voxel", ([&] {
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
        assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
                                         at::cuda::getCurrentCUDAStream()>>>(
            coors_output_size, temp_coors.contiguous().data_ptr<int>(),
...
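Across the voxelization files the recurring edit is the same: the dispatch macros now receive `scalar_type()` instead of the deprecated `type()`. For reference, a minimal self-contained use of the macro under that convention (a hypothetical op, not from this commit; assumes a 1-D CPU floating-point tensor):

#include <torch/extension.h>

// Minimal AT_DISPATCH example with the non-deprecated scalar_type() argument:
// sums a 1-D floating-point tensor element-wise on the CPU. The macro expands
// the lambda once per dtype, binding `scalar_t` to the concrete element type.
double sum_elements(const at::Tensor& input) {
  TORCH_CHECK(input.dim() == 1, "input must be 1-D");
  double total = 0.0;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "sum_elements", [&] {
        auto acc = input.accessor<scalar_t, 1>();
        for (int64_t i = 0; i < acc.size(0); ++i) {
          total += static_cast<double>(acc[i]);
        }
      });
  return total;
}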