OpenDAS / mmdetection3d · Commits

Commit 333536f6 (unverified)
Authored Apr 06, 2022 by Wenwei Zhang; committed by GitHub on Apr 06, 2022

    Release v1.0.0rc1

Parents: 9c7270d0, f747daab
Changes: 219 files in the commit; shown below are 20 changed files with 7 additions and 1845 deletions (+7 -1845).
Files changed:
  mmdet3d/ops/knn/src/knn.cpp                                  +0  -46
  mmdet3d/ops/knn/src/knn_cuda.cu                              +0  -115
  mmdet3d/ops/paconv/__init__.py                               +1  -2
  mmdet3d/ops/paconv/assign_score.py                           +0  -102
  mmdet3d/ops/paconv/paconv.py                                 +1  -1
  mmdet3d/ops/paconv/src/assign_score_withk.cpp                +0  -36
  mmdet3d/ops/paconv/src/assign_score_withk_cuda.cu            +0  -212
  mmdet3d/ops/pointnet_modules/point_fp_module.py              +1  -2
  mmdet3d/ops/pointnet_modules/point_sa_module.py              +4  -2
  mmdet3d/ops/roiaware_pool3d/__init__.py                      +0  -9
  mmdet3d/ops/roiaware_pool3d/points_in_boxes.py               +0  -129
  mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py               +0  -111
  mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp      +0  -67
  mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu      +0  -201
  mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp          +0  -136
  mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu    +0  -364
  mmdet3d/ops/roipoint_pool3d/__init__.py                      +0  -4
  mmdet3d/ops/roipoint_pool3d/roipoint_pool3d.py               +0  -72
  mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d.cpp          +0  -66
  mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu    +0  -168
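Taken together, the deletions remove the standalone CUDA extensions under mmdet3d/ops, and the three modified Python files switch their imports to the corresponding operators in mmcv. As orientation only, a minimal sketch of the replacement imports downstream code would use after this release is shown below; it assumes the installed mmcv-full build ships these operators (the exact version requirement is not stated in this diff):

    # Sketch only: the ops deleted in this commit are imported from mmcv.ops
    # instead (assumes a CUDA-enabled mmcv-full build that provides them).
    from mmcv.ops import knn                         # replaces mmdet3d/ops/knn
    from mmcv.ops import assign_score_withk          # replaces mmdet3d/ops/paconv/assign_score.py
    from mmcv.ops import (RoIAwarePool3d, points_in_boxes_all,
                          points_in_boxes_cpu,
                          points_in_boxes_part)      # replaces mmdet3d/ops/roiaware_pool3d
    from mmcv.ops import RoIPointPool3d              # replaces mmdet3d/ops/roipoint_pool3d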
mmdet3d/ops/knn/src/knn.cpp (deleted, 100644 → 0):

// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <vector>
#include <THC/THC.h>
#include <ATen/cuda/CUDAContext.h>

extern THCState *state;

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)

void knn_kernel_launcher(int b, int n, int m, int nsample,
                         const float *xyz, const float *new_xyz,
                         int *idx, float *dist2, cudaStream_t stream);

void knn_wrapper(int b, int n, int m, int nsample,
                 at::Tensor xyz_tensor, at::Tensor new_xyz_tensor,
                 at::Tensor idx_tensor, at::Tensor dist2_tensor) {
  CHECK_INPUT(new_xyz_tensor);
  CHECK_INPUT(xyz_tensor);

  const float *new_xyz = new_xyz_tensor.data_ptr<float>();
  const float *xyz = xyz_tensor.data_ptr<float>();
  int *idx = idx_tensor.data_ptr<int>();
  float *dist2 = dist2_tensor.data_ptr<float>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  knn_kernel_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("knn_wrapper", &knn_wrapper, "knn_wrapper");
}
mmdet3d/ops/knn/src/knn_cuda.cu (deleted, 100644 → 0):

// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include <cmath>
#include <cstdio>

#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))

__device__ void swap_float(float *x, float *y) {
  float tmp = *x;
  *x = *y;
  *y = tmp;
}

__device__ void swap_int(int *x, int *y) {
  int tmp = *x;
  *x = *y;
  *y = tmp;
}

__device__ void reheap(float *dist, int *idx, int k) {
  int root = 0;
  int child = root * 2 + 1;
  while (child < k) {
    if (child + 1 < k && dist[child + 1] > dist[child]) child++;
    if (dist[root] > dist[child]) return;
    swap_float(&dist[root], &dist[child]);
    swap_int(&idx[root], &idx[child]);
    root = child;
    child = root * 2 + 1;
  }
}

__device__ void heap_sort(float *dist, int *idx, int k) {
  int i;
  for (i = k - 1; i > 0; i--) {
    swap_float(&dist[0], &dist[i]);
    swap_int(&idx[0], &idx[i]);
    reheap(dist, idx, i);
  }
}

// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample)
__global__ void knn_kernel(int b, int n, int m, int nsample,
                           const float *__restrict__ xyz,
                           const float *__restrict__ new_xyz,
                           int *__restrict__ idx,
                           float *__restrict__ dist2) {
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= m) return;

  new_xyz += bs_idx * m * 3 + pt_idx * 3;
  xyz += bs_idx * n * 3;
  idx += bs_idx * m * nsample + pt_idx * nsample;
  dist2 += bs_idx * m * nsample + pt_idx * nsample;

  float new_x = new_xyz[0];
  float new_y = new_xyz[1];
  float new_z = new_xyz[2];

  float best_dist[100];
  int best_idx[100];
  for (int i = 0; i < nsample; i++) {
    best_dist[i] = 1e10;
    best_idx[i] = 0;
  }
  for (int i = 0; i < n; i++) {
    float x = xyz[i * 3 + 0];
    float y = xyz[i * 3 + 1];
    float z = xyz[i * 3 + 2];
    float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
               (new_z - z) * (new_z - z);
    if (d2 < best_dist[0]) {
      best_dist[0] = d2;
      best_idx[0] = i;
      reheap(best_dist, best_idx, nsample);
    }
  }
  heap_sort(best_dist, best_idx, nsample);
  for (int i = 0; i < nsample; i++) {
    idx[i] = best_idx[i];
    dist2[i] = best_dist[i];
  }
}

void knn_kernel_launcher(int b, int n, int m, int nsample, const float *xyz,
                         const float *new_xyz, int *idx, float *dist2,
                         cudaStream_t stream) {
  // param new_xyz: (B, m, 3)
  // param xyz: (B, n, 3)
  // param idx: (B, m, nsample)

  cudaError_t err;

  dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  knn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz,
                                             idx, dist2);
  // cudaDeviceSynchronize();  // for using printf in kernel function

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
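For reference, the deleted kernel keeps a per-query max-heap of the nsample smallest squared distances and heap-sorts it at the end, so the returned indices are ordered by ascending distance. A torch-only sketch of the same computation (illustrative only, not part of the commit) is:

    import torch

    def knn_reference(xyz, new_xyz, nsample):
        # xyz: (B, n, 3) support points; new_xyz: (B, m, 3) query points.
        # Returns idx (B, m, nsample) and squared distances dist2 (B, m, nsample),
        # sorted by ascending distance, matching the deleted kernel's output.
        dist2 = torch.cdist(new_xyz, xyz) ** 2                    # (B, m, n)
        dist2, idx = dist2.topk(nsample, dim=-1, largest=False)   # smallest first
        return idx.int(), dist2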
mmdet3d/ops/paconv/__init__.py (modified, +1 -2):

 # Copyright (c) OpenMMLab. All rights reserved.
-from .assign_score import assign_score_withk
 from .paconv import PAConv, PAConvCUDA

-__all__ = ['assign_score_withk', 'PAConv', 'PAConvCUDA']
+__all__ = ['PAConv', 'PAConvCUDA']
mmdet3d/ops/paconv/assign_score.py (deleted, 100644 → 0):

# Copyright (c) OpenMMLab. All rights reserved.
from torch.autograd import Function

from . import assign_score_withk_ext


class AssignScoreWithK(Function):
    r"""Perform weighted sum to generate output features according to scores.

    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/paconv_lib/src/gpu>`_.

    This is a memory-efficient CUDA implementation of assign_scores operation,
    which first transform all point feature with weight bank, then assemble
    neighbor features with `knn_idx` and perform weighted sum of `scores`.
    See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D for
    more detailed descriptions.

    Note:
        This implementation assumes using ``neighbor`` kernel input, which is
            (point_features - center_features, point_features).
        See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
        pointnet2/paconv.py#L128 for more details.
    """

    @staticmethod
    def forward(ctx,
                scores,
                point_features,
                center_features,
                knn_idx,
                aggregate='sum'):
        """Forward.

        Args:
            scores (torch.Tensor): (B, npoint, K, M), predicted scores to
                aggregate weight matrices in the weight bank.
                ``npoint`` is the number of sampled centers.
                ``K`` is the number of queried neighbors.
                ``M`` is the number of weight matrices in the weight bank.
            point_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed point features to be aggregated.
            center_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed center features to be aggregated.
            knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
                We assume the first idx in each row is the idx of the center.
            aggregate (str, optional): Aggregation method.
                Can be 'sum', 'avg' or 'max'. Defaults to 'sum'.

        Returns:
            torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
        """
        agg = {'sum': 0, 'avg': 1, 'max': 2}

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        output = point_features.new_zeros((B, out_dim, npoint, K))
        assign_score_withk_ext.assign_score_withk_forward_wrapper(
            B, N, npoint, M, K, out_dim, agg[aggregate],
            point_features.contiguous(), center_features.contiguous(),
            scores.contiguous(), knn_idx.contiguous(), output)

        ctx.save_for_backward(output, point_features, center_features, scores,
                              knn_idx)
        ctx.agg = agg[aggregate]

        return output

    @staticmethod
    def backward(ctx, grad_out):
        """Backward.

        Args:
            grad_out (torch.Tensor): (B, out_dim, npoint, K)

        Returns:
            grad_scores (torch.Tensor): (B, npoint, K, M)
            grad_point_features (torch.Tensor): (B, N, M, out_dim)
            grad_center_features (torch.Tensor): (B, N, M, out_dim)
        """
        _, point_features, center_features, scores, knn_idx = ctx.saved_tensors
        agg = ctx.agg

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        grad_point_features = point_features.new_zeros(point_features.shape)
        grad_center_features = center_features.new_zeros(center_features.shape)
        grad_scores = scores.new_zeros(scores.shape)

        assign_score_withk_ext.assign_score_withk_backward_wrapper(
            B, N, npoint, M, K, out_dim, agg, grad_out.contiguous(),
            point_features.contiguous(), center_features.contiguous(),
            scores.contiguous(), knn_idx.contiguous(), grad_point_features,
            grad_center_features, grad_scores)

        return grad_scores, grad_point_features, \
            grad_center_features, None, None


assign_score_withk = AssignScoreWithK.apply
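To make the removed op's semantics explicit, here is a naive torch-only reference (illustrative code written for this summary, covering only the default 'sum' aggregation and assuming all knn indices are in range):

    import torch

    def assign_score_withk_sum_reference(scores, points, centers, knn_idx):
        # scores: (B, npoint, K, M); points/centers: (B, N, M, out_dim);
        # knn_idx: (B, npoint, K). Returns (B, out_dim, npoint, K).
        B, npoint, K, M = scores.shape
        out_dim = points.shape[-1]
        output = points.new_zeros((B, out_dim, npoint, K))
        for b in range(B):
            for i in range(npoint):
                cn = knn_idx[b, i, 0]                      # first idx is the center
                for k in range(K):
                    kn = knn_idx[b, i, k]
                    feat = points[b, kn] - centers[b, cn]  # (M, out_dim), 'neighbor' kernel input
                    # weighted sum over the M weight-bank responses
                    output[b, :, i, k] = (scores[b, i, k, :, None] * feat).sum(dim=0)
        return output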
mmdet3d/ops/paconv/paconv.py (modified, +1 -1):

 @@ -4,10 +4,10 @@ import copy
  import torch
  from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer,
                        constant_init)
 +from mmcv.ops import assign_score_withk as assign_score_cuda
  from torch import nn as nn
  from torch.nn import functional as F
 -from .assign_score import assign_score_withk as assign_score_cuda
  from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist
  ...
mmdet3d/ops/paconv/src/assign_score_withk.cpp (deleted, 100644 → 0):

// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu

#include <torch/torch.h>
#include <torch/extension.h>

void assign_score_withk_forward_wrapper(
  int B, int N0, int N1, int M, int K, int O, int aggregate,
  const at::Tensor& points,
  const at::Tensor& centers,
  const at::Tensor& scores,
  const at::Tensor& knn_idx,
  at::Tensor& output);

void assign_score_withk_backward_wrapper(
  int B, int N0, int N1, int M, int K, int O, int aggregate,
  const at::Tensor& grad_out,
  const at::Tensor& points,
  const at::Tensor& centers,
  const at::Tensor& scores,
  const at::Tensor& knn_idx,
  at::Tensor& grad_points,
  at::Tensor& grad_centers,
  at::Tensor& grad_scores);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("assign_score_withk_forward_wrapper",
        &assign_score_withk_forward_wrapper,
        "Assign score kernel forward (GPU), save memory version");
  m.def("assign_score_withk_backward_wrapper",
        &assign_score_withk_backward_wrapper,
        "Assign score kernel backward (GPU), save memory version");
}
mmdet3d/ops/paconv/src/assign_score_withk_cuda.cu (deleted, 100644 → 0):

// Modified from https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cmath>
#include <cstdint>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_CONTIGUOUS(x)                                          \
  do {                                                               \
    AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \
  } while (0)

#define CUDA_CHECK_ERRORS()                                           \
  do {                                                                \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
      fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n",  \
              cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
              __FILE__);                                              \
      exit(-1);                                                       \
    }                                                                 \
  } while (0)

// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
// output: fout(B,O,N)
// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j)
//       i(k) = idx(b,i,k)
//      sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)
//      avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k
//      max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))
__global__ void assign_score_withk_forward_kernel(
    const int B, const int N0, const int N1, const int M, const int K,
    const int O, const int aggregate, const float* points,
    const float* centers, const float* scores, const int64_t* knn_idx,
    float* output) {
  // ----- parallel loop for B, N1, K and O ---------
  long i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= B * N1 * K * O) return;
  // ------- loop for M ----------
  for (int m = 0; m < M; m++) {
    int b = (int)(i / (O * N1 * K));
    int o = (int)(i % (O * N1 * K) / (N1 * K));
    int n = (int)(i % (N1 * K) / K);
    int k = (int)(i % K);
    int cn = (int)knn_idx[b * K * N1 + n * K + 0];  // The first neighbor is the center point
    int kn = (int)knn_idx[b * K * N1 + n * K + k];
    if (kn >= N0 || kn < 0) {  // if index overflows, it is out of the neighborhood range
      continue;
    }
    assert(b < B);
    assert(kn < N0);
    assert(cn < N0);
    assert(o < O);
    assert(n < N1);
    atomicAdd(output + b * N1 * O * K + o * N1 * K + n * K + k,
              points[b * N0 * M * O + kn * M * O + m * O + o] *
                      scores[b * N1 * K * M + n * K * M + k * M + m] -
                  centers[b * N0 * M * O + cn * M * O + m * O + o] *
                      scores[b * N1 * K * M + n * K * M + k * M + m]);
  }
}

__global__ void assign_score_withk_backward_points_kernel(
    const int B, const int N0, const int N, const int M, const int K,
    const int O, const int aggregate, const float* grad_out,
    const float* scores, const int64_t* knn_idx, float* grad_points,
    float* grad_centers) {
  // ----- parallel loop for B, M, O ---------
  long i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= B * M * O) return;
  int b = (int)(i / (M * O));
  int m = (int)(i % (M * O) / O);
  int o = (int)(i % O);

  // ----- loop for N,K ---------
  for (int n = 0; n < N; n++) {
    for (int k = 0; k < K; k++) {
      int kn = knn_idx[b * N * K + n * K + k];
      int cn = knn_idx[b * N * K + n * K + 0];
      if (kn >= N0 || kn < 0) {  // if index overflows, it is out of the neighborhood range
        continue;
      }
      atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
                scores[b * N * K * M + n * K * M + k * M + m] *
                    grad_out[b * O * N * K + o * N * K + n * K + k]);
      atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
                -scores[b * N * K * M + n * K * M + k * M + m] *
                    grad_out[b * O * N * K + o * N * K + n * K + k]);
    }
  }
}

__global__ void assign_score_withk_backward_scores_kernel(
    const int B, const int N0, const int N, const int M, const int K,
    const int O, const int aggregate, const float* grad_out,
    const float* points, const float* centers, const int64_t* knn_idx,
    float* grad_scores) {
  // ----- parallel loop for B, N, K, M ---------
  long i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= B * N * K * M) return;
  int b = (int)(i / (N * M * K));
  int n = (int)(i % (N * M * K) / M / K);
  int k = (int)(i % (M * K) / M);
  int m = (int)(i % M);
  int cn = knn_idx[b * N * K + n * K + 0];
  int kn = knn_idx[b * N * K + n * K + k];
  if (kn >= N0 || kn < 0) {  // if index overflows, it is out of the neighborhood range
    return;
  }

  // -------------- loop for O ------------------------
  for (int o = 0; o < O; o++) {
    atomicAdd(grad_scores + b * N * K * M + n * K * M + k * M + m,
              (points[b * N0 * M * O + kn * M * O + m * O + o] -
               centers[b * N0 * M * O + cn * M * O + m * O + o]) *
                  grad_out[b * O * N * K + o * N * K + n * K + k]);
  }
}

void assign_score_withk_forward_wrapper(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const at::Tensor& points, const at::Tensor& centers,
    const at::Tensor& scores, const at::Tensor& knn_idx, at::Tensor& output) {
  CHECK_CONTIGUOUS(points);
  CHECK_CONTIGUOUS(centers);
  CHECK_CONTIGUOUS(scores);
  CHECK_CONTIGUOUS(knn_idx);
  CHECK_CONTIGUOUS(output);

  const float* points_data = points.data_ptr<float>();
  const float* centers_data = centers.data_ptr<float>();
  const float* scores_data = scores.data_ptr<float>();
  const int64_t* knn_idx_data = knn_idx.data_ptr<int64_t>();
  float* output_data = output.data_ptr<float>();

  dim3 blocks(DIVUP(B * O * N1 * K, THREADS_PER_BLOCK));
  dim3 threads(THREADS_PER_BLOCK);
  assign_score_withk_forward_kernel<<<blocks, threads, 0>>>(
      B, N0, N1, M, K, O, aggregate, points_data, centers_data, scores_data,
      knn_idx_data, output_data);

  CUDA_CHECK_ERRORS();
}

void assign_score_withk_backward_wrapper(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const at::Tensor& grad_out, const at::Tensor& points,
    const at::Tensor& centers, const at::Tensor& scores,
    const at::Tensor& knn_idx, at::Tensor& grad_points,
    at::Tensor& grad_centers, at::Tensor& grad_scores) {
  CHECK_CONTIGUOUS(grad_out);
  CHECK_CONTIGUOUS(scores);
  CHECK_CONTIGUOUS(points);
  CHECK_CONTIGUOUS(centers);
  CHECK_CONTIGUOUS(knn_idx);
  CHECK_CONTIGUOUS(grad_scores);
  CHECK_CONTIGUOUS(grad_points);
  CHECK_CONTIGUOUS(grad_centers);

  const float* grad_out_data = grad_out.data_ptr<float>();
  const float* points_data = points.data_ptr<float>();
  const float* centers_data = centers.data_ptr<float>();
  const float* scores_data = scores.data_ptr<float>();
  const int64_t* knn_idx_data = knn_idx.data_ptr<int64_t>();
  float* grad_points_data = grad_points.data_ptr<float>();
  float* grad_centers_data = grad_centers.data_ptr<float>();
  float* grad_scores_data = grad_scores.data_ptr<float>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 blocks1(DIVUP(B * M * O, THREADS_PER_BLOCK));
  dim3 threads1(THREADS_PER_BLOCK);
  dim3 blocks2(DIVUP(B * N1 * K * M, THREADS_PER_BLOCK));
  dim3 threads2(THREADS_PER_BLOCK);

  assign_score_withk_backward_points_kernel<<<blocks1, threads1, 0>>>(
      B, N0, N1, M, K, O, aggregate, grad_out_data, scores_data, knn_idx_data,
      grad_points_data, grad_centers_data);
  assign_score_withk_backward_scores_kernel<<<blocks2, threads2, 0>>>(
      B, N0, N1, M, K, O, aggregate, grad_out_data, points_data, centers_data,
      knn_idx_data, grad_scores_data);

  CUDA_CHECK_ERRORS();
}
mmdet3d/ops/pointnet_modules/point_fp_module.py (modified, +1 -2):

 @@ -3,11 +3,10 @@ from typing import List
  import torch
  from mmcv.cnn import ConvModule
 +from mmcv.ops import three_interpolate, three_nn
  from mmcv.runner import BaseModule, force_fp32
  from torch import nn as nn
 -from mmdet3d.ops import three_interpolate, three_nn
  class PointFPModule(BaseModule):
      """Point feature propagation module used in PointNets.
  ...
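The hunk above only swaps the import source; the underlying feature propagation (inverse-distance-weighted interpolation over the three nearest neighbors) is unchanged. A torch-only sketch of that interpolation step, written here purely for illustration, is:

    import torch

    def three_interpolate_reference(target_xyz, source_xyz, source_feats, eps=1e-8):
        # target_xyz: (B, n, 3); source_xyz: (B, m, 3); source_feats: (B, C, m).
        # Returns (B, C, n): features interpolated from the 3 nearest source points
        # with inverse-distance weights, as in PointNet++ feature propagation.
        dist = torch.cdist(target_xyz, source_xyz)              # (B, n, m)
        dist3, idx3 = dist.topk(3, dim=-1, largest=False)       # (B, n, 3)
        weight = 1.0 / (dist3 + eps)
        weight = weight / weight.sum(dim=-1, keepdim=True)      # (B, n, 3)
        B, C, m = source_feats.shape
        n = target_xyz.shape[1]
        idx = idx3.unsqueeze(1).expand(B, C, n, 3)              # (B, C, n, 3)
        gathered = source_feats.unsqueeze(2).expand(B, C, n, m).gather(3, idx)
        return (gathered * weight.unsqueeze(1)).sum(dim=-1)     # (B, C, n)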
mmdet3d/ops/pointnet_modules/point_sa_module.py (modified, +4 -2):

  # Copyright (c) OpenMMLab. All rights reserved.
  import torch
  from mmcv.cnn import ConvModule
 +from mmcv.ops import GroupAll
 +from mmcv.ops import PointsSampler as Points_Sampler
 +from mmcv.ops import QueryAndGroup, gather_points
  from torch import nn as nn
  from torch.nn import functional as F
 -from mmdet3d.ops import (GroupAll, PAConv, Points_Sampler, QueryAndGroup,
 -                         gather_points)
 +from mmdet3d.ops import PAConv
  from .builder import SA_MODULES
  ...
mmdet3d/ops/roiaware_pool3d/__init__.py (deleted, 100644 → 0):

# Copyright (c) OpenMMLab. All rights reserved.
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
                              points_in_boxes_part)
from .roiaware_pool3d import RoIAwarePool3d

__all__ = [
    'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu',
    'points_in_boxes_all'
]
mmdet3d/ops/roiaware_pool3d/points_in_boxes.py (deleted, 100644 → 0):

# Copyright (c) OpenMMLab. All rights reserved.
import torch

from . import roiaware_pool3d_ext


def points_in_boxes_part(points, boxes):
    """Find the box in which each point is (CUDA).

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in
            LiDAR/DEPTH coordinate, (x, y, z) is the bottom center

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M), default background = -1
    """
    assert points.shape[0] == boxes.shape[0], \
        f'Points and boxes should have the same batch size, ' \
        f'got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        f'boxes dimension should be 7, ' \
        f'got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        f'points dimension should be 3, ' \
        f'got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape

    box_idxs_of_pts = points.new_zeros((batch_size, num_points),
                                       dtype=torch.int).fill_(-1)

    # If manually put the tensor 'points' or 'boxes' on a device
    # which is not the current device, some temporary variables
    # will be created on the current device in the cuda op,
    # and the output will be incorrect.
    # Therefore, we force the current device to be the same
    # as the device of the tensors if it was not.
    # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305
    # for the incorrect output before the fix.
    points_device = points.get_device()
    assert points_device == boxes.get_device(), \
        'Points and boxes should be put on the same device'
    if torch.cuda.current_device() != points_device:
        torch.cuda.set_device(points_device)

    roiaware_pool3d_ext.points_in_boxes_part(boxes.contiguous(),
                                             points.contiguous(),
                                             box_idxs_of_pts)

    return box_idxs_of_pts


def points_in_boxes_cpu(points, boxes):
    """Find all boxes in which each point is (CPU). The CPU version of
    :meth:`points_in_boxes_all`.

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in
            LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
    """
    assert points.shape[0] == boxes.shape[0], \
        f'Points and boxes should have the same batch size, ' \
        f'got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        f'boxes dimension should be 7, ' \
        f'got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        f'points dimension should be 3, ' \
        f'got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape
    num_boxes = boxes.shape[1]

    point_indices = points.new_zeros((batch_size, num_boxes, num_points),
                                     dtype=torch.int)
    for b in range(batch_size):
        roiaware_pool3d_ext.points_in_boxes_cpu(boxes[b].float().contiguous(),
                                                points[b].float().contiguous(),
                                                point_indices[b])
    point_indices = point_indices.transpose(1, 2)

    return point_indices


def points_in_boxes_all(points, boxes):
    """Find all boxes in which each point is (CUDA).

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
    """
    assert boxes.shape[0] == points.shape[0], \
        f'Points and boxes should have the same batch size, ' \
        f'got {boxes.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        f'boxes dimension should be 7, ' \
        f'got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        f'points dimension should be 3, ' \
        f'got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape
    num_boxes = boxes.shape[1]

    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
                                       dtype=torch.int).fill_(0)

    # Same reason as line 25-32
    points_device = points.get_device()
    assert points_device == boxes.get_device(), \
        'Points and boxes should be put on the same device'
    if torch.cuda.current_device() != points_device:
        torch.cuda.set_device(points_device)

    roiaware_pool3d_ext.points_in_boxes_all(boxes.contiguous(),
                                            points.contiguous(),
                                            box_idxs_of_pts)

    return box_idxs_of_pts
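For clarity about what these deleted wrappers (and the CUDA/CPU sources further below) actually test, here is a torch-only reference for a single batch, written only as an illustration: a point lies in a box if, after shifting into the box frame and rotating by -rz around z, it falls within the half-sizes, with the z extent measured from the bottom-centered origin. This mirrors the points_in_boxes_all semantics; the *_part variant keeps only the first matching box index per point.

    import torch

    def points_in_boxes_reference(points, boxes):
        # points: (M, 3); boxes: (T, 7) as [x, y, z, x_size, y_size, z_size, rz],
        # with (x, y, z) the bottom center. Returns a bool mask of shape (M, T).
        shift = points[:, None, :2] - boxes[None, :, :2]          # (M, T, 2)
        rz = boxes[:, 6]
        cosa, sina = torch.cos(-rz), torch.sin(-rz)
        local_x = shift[..., 0] * cosa - shift[..., 1] * sina     # (M, T)
        local_y = shift[..., 0] * sina + shift[..., 1] * cosa
        dz = points[:, None, 2] - (boxes[None, :, 2] + boxes[None, :, 5] / 2)
        in_x = local_x.abs() < boxes[None, :, 3] / 2
        in_y = local_y.abs() < boxes[None, :, 4] / 2
        in_z = dz.abs() <= boxes[None, :, 5] / 2
        return in_x & in_y & in_z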
mmdet3d/ops/roiaware_pool3d/roiaware_pool3d.py (deleted, 100644 → 0):

# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import torch
from torch import nn as nn
from torch.autograd import Function

from . import roiaware_pool3d_ext


class RoIAwarePool3d(nn.Module):

    def __init__(self, out_size, max_pts_per_voxel=128, mode='max'):
        super().__init__()
        """RoIAwarePool3d module

        Args:
            out_size (int or tuple): n or [n1, n2, n3]
            max_pts_per_voxel (int): m
            mode (str): 'max' or 'avg'
        """
        self.out_size = out_size
        self.max_pts_per_voxel = max_pts_per_voxel
        assert mode in ['max', 'avg']
        pool_method_map = {'max': 0, 'avg': 1}
        self.mode = pool_method_map[mode]

    def forward(self, rois, pts, pts_feature):
        """RoIAwarePool3d module forward.

        Args:
            rois (torch.Tensor): [N, 7],in LiDAR coordinate,
                (x, y, z) is the bottom center of rois
            pts (torch.Tensor): [npoints, 3]
            pts_feature (torch.Tensor): [npoints, C]

        Returns:
            pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
        """

        return RoIAwarePool3dFunction.apply(rois, pts, pts_feature,
                                            self.out_size,
                                            self.max_pts_per_voxel, self.mode)


class RoIAwarePool3dFunction(Function):

    @staticmethod
    def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel,
                mode):
        """RoIAwarePool3d function forward.

        Args:
            rois (torch.Tensor): [N, 7], in LiDAR coordinate,
                (x, y, z) is the bottom center of rois
            pts (torch.Tensor): [npoints, 3]
            pts_feature (torch.Tensor): [npoints, C]
            out_size (int or tuple): n or [n1, n2, n3]
            max_pts_per_voxel (int): m
            mode (int): 0 (max pool) or 1 (average pool)

        Returns:
            pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
        """

        if isinstance(out_size, int):
            out_x = out_y = out_z = out_size
        else:
            assert len(out_size) == 3
            assert mmcv.is_tuple_of(out_size, int)
            out_x, out_y, out_z = out_size

        num_rois = rois.shape[0]
        num_channels = pts_feature.shape[-1]
        num_pts = pts.shape[0]

        pooled_features = pts_feature.new_zeros(
            (num_rois, out_x, out_y, out_z, num_channels))
        argmax = pts_feature.new_zeros(
            (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int)
        pts_idx_of_voxels = pts_feature.new_zeros(
            (num_rois, out_x, out_y, out_z, max_pts_per_voxel),
            dtype=torch.int)

        roiaware_pool3d_ext.forward(rois, pts, pts_feature, argmax,
                                    pts_idx_of_voxels, pooled_features, mode)

        ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode,
                                            num_pts, num_channels)
        return pooled_features

    @staticmethod
    def backward(ctx, grad_out):
        """RoIAwarePool3d function forward.

        Args:
            grad_out (torch.Tensor): [N, out_x, out_y, out_z, C]
        Returns:
            grad_in (torch.Tensor): [npoints, C]
        """
        ret = ctx.roiaware_pool3d_for_backward
        pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret

        grad_in = grad_out.new_zeros((num_pts, num_channels))
        roiaware_pool3d_ext.backward(pts_idx_of_voxels, argmax,
                                     grad_out.contiguous(), grad_in, mode)

        return None, None, grad_in, None, None, None


if __name__ == '__main__':
    pass
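Since this module now lives in mmcv, a minimal usage sketch of the equivalent mmcv op follows; it assumes a CUDA-enabled mmcv-full build, and the shapes simply mirror the docstrings above:

    import torch
    from mmcv.ops import RoIAwarePool3d  # replacement for the deleted module

    pool = RoIAwarePool3d(out_size=4, max_pts_per_voxel=128, mode='max')
    rois = torch.rand(8, 7).cuda()             # (N, 7), bottom-centered boxes
    pts = torch.rand(1024, 3).cuda()           # (npoints, 3)
    pts_feature = torch.rand(1024, 16).cuda()  # (npoints, C)
    pooled = pool(rois, pts, pts_feature)      # (N, 4, 4, 4, 16)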
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp (deleted, 100644 → 0):

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")

// #define DEBUG

inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
  float cosa = cos(-rz), sina = sin(-rz);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}

inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
  // cz in the bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
  cz += z_size / 2.0;  // shift to the center since cz in box3d is the bottom center

  if (fabsf(z - cz) > z_size / 2.0) return 0;
  lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
  return in_flag;
}

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor) {
  // params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:
  // (npoints, 3) [x, y, z] in LiDAR coordinate params pts_indices: (N, npoints)
  CHECK_CONTIGUOUS(boxes_tensor);
  CHECK_CONTIGUOUS(pts_tensor);
  CHECK_CONTIGUOUS(pts_indices_tensor);

  int boxes_num = boxes_tensor.size(0);
  int pts_num = pts_tensor.size(0);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *pts_indices = pts_indices_tensor.data_ptr<int>();

  float local_x = 0, local_y = 0;
  for (int i = 0; i < boxes_num; i++) {
    for (int j = 0; j < pts_num; j++) {
      int cur_in_flag =
          check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
      pts_indices[i * pts_num + j] = cur_in_flag;
    }
  }

  return 1;
}
mmdet3d/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu (deleted, 100644 → 0):

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

// #define DEBUG

__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
  float cosa = cos(-rz), sina = sin(-rz);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}

__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
  // cz in the bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
  cz += z_size / 2.0;  // shift to the center since cz in box3d is the bottom center

  if (fabsf(z - cz) > z_size / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
  return in_flag;
}

__global__ void points_in_boxes_part_kernel(int batch_size, int boxes_num,
                                            int pts_num, const float *boxes,
                                            const float *pts,
                                            int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:
  // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points:
  // (B, npoints), default -1
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= batch_size || pt_idx >= pts_num) return;

  boxes += bs_idx * boxes_num * 7;
  pts += bs_idx * pts_num * 3 + pt_idx * 3;
  box_idx_of_points += bs_idx * pts_num + pt_idx;

  float local_x = 0, local_y = 0;
  int cur_in_flag = 0;
  for (int k = 0; k < boxes_num; k++) {
    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
    if (cur_in_flag) {
      box_idx_of_points[0] = k;
      break;
    }
  }
}

__global__ void points_in_boxes_all_kernel(int batch_size, int boxes_num,
                                           int pts_num, const float *boxes,
                                           const float *pts,
                                           int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:
  // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points:
  // (B, npoints), default -1
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= batch_size || pt_idx >= pts_num) return;

  boxes += bs_idx * boxes_num * 7;
  pts += bs_idx * pts_num * 3 + pt_idx * 3;
  box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;

  float local_x = 0, local_y = 0;
  int cur_in_flag = 0;
  for (int k = 0; k < boxes_num; k++) {
    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
    if (cur_in_flag) {
      box_idx_of_points[k] = 1;
    }
    cur_in_flag = 0;
  }
}

void points_in_boxes_part_launcher(int batch_size, int boxes_num, int pts_num,
                                   const float *boxes, const float *pts,
                                   int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:
  // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points:
  // (B, npoints), default -1
  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
  points_in_boxes_part_kernel<<<blocks, threads>>>(batch_size, boxes_num,
                                                   pts_num, boxes, pts,
                                                   box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }

#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}

void points_in_boxes_all_launcher(int batch_size, int boxes_num, int pts_num,
                                  const float *boxes, const float *pts,
                                  int *box_idx_of_points) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, each box params pts: (B, npoints, 3)
  // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),
  // default -1
  cudaError_t err;

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
  dim3 threads(THREADS_PER_BLOCK);
  points_in_boxes_all_kernel<<<blocks, threads>>>(batch_size, boxes_num,
                                                  pts_num, boxes, pts,
                                                  box_idx_of_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }

#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}

int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                         at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:
  // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points:
  // (B, npoints), default -1
  CHECK_INPUT(boxes_tensor);
  CHECK_INPUT(pts_tensor);
  CHECK_INPUT(box_idx_of_points_tensor);

  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

  points_in_boxes_part_launcher(batch_size, boxes_num, pts_num, boxes, pts,
                                box_idx_of_points);

  return 1;
}

int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor) {
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z]
  // in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
  CHECK_INPUT(boxes_tensor);
  CHECK_INPUT(pts_tensor);
  CHECK_INPUT(box_idx_of_points_tensor);

  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();

  points_in_boxes_all_launcher(batch_size, boxes_num, pts_num, boxes, pts,
                               box_idx_of_points);

  return 1;
}
mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d.cpp (deleted, 100644 → 0):

// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.

#include <assert.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels,
                              int max_pts_each_voxel, int out_x, int out_y,
                              int out_z, const float *rois, const float *pts,
                              const float *pts_feature, int *argmax,
                              int *pts_idx_of_voxels, float *pooled_features,
                              int pool_method);

void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
                                       int out_z, int channels,
                                       int max_pts_each_voxel,
                                       const int *pts_idx_of_voxels,
                                       const int *argmax,
                                       const float *grad_out, float *grad_in,
                                       int pool_method);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts,
                        at::Tensor pts_feature, at::Tensor argmax,
                        at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method);

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method);

int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor pts_indices_tensor);

int points_in_boxes_part(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                         at::Tensor box_idx_of_points_tensor);

int points_in_boxes_all(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                        at::Tensor box_idx_of_points_tensor);

int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts,
                        at::Tensor pts_feature, at::Tensor argmax,
                        at::Tensor pts_idx_of_voxels,
                        at::Tensor pooled_features, int pool_method) {
  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR coordinate
  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
  // params pts_feature: (npoints, C)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params pool_method: 0: max_pool 1: avg_pool
  CHECK_INPUT(rois);
  CHECK_INPUT(pts);
  CHECK_INPUT(pts_feature);
  CHECK_INPUT(argmax);
  CHECK_INPUT(pts_idx_of_voxels);
  CHECK_INPUT(pooled_features);

  int boxes_num = rois.size(0);
  int pts_num = pts.size(0);
  int channels = pts_feature.size(1);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  assert((out_x < 256) && (out_y < 256) && (out_z < 256));  // we encode index with 8bit

  const float *rois_data = rois.data_ptr<float>();
  const float *pts_data = pts.data_ptr<float>();
  const float *pts_feature_data = pts_feature.data_ptr<float>();
  int *argmax_data = argmax.data_ptr<int>();
  int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  float *pooled_features_data = pooled_features.data_ptr<float>();

  roiaware_pool3d_launcher(boxes_num, pts_num, channels, max_pts_each_voxel,
                           out_x, out_y, out_z, rois_data, pts_data,
                           pts_feature_data, argmax_data,
                           pts_idx_of_voxels_data, pooled_features_data,
                           pool_method);

  return 1;
}

int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels,
                                 at::Tensor argmax, at::Tensor grad_out,
                                 at::Tensor grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // params pool_method: 0: max_pool 1: avg_pool
  CHECK_INPUT(pts_idx_of_voxels);
  CHECK_INPUT(argmax);
  CHECK_INPUT(grad_out);
  CHECK_INPUT(grad_in);

  int boxes_num = pts_idx_of_voxels.size(0);
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int channels = grad_out.size(4);

  const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr<int>();
  const int *argmax_data = argmax.data_ptr<int>();
  const float *grad_out_data = grad_out.data_ptr<float>();
  float *grad_in_data = grad_in.data_ptr<float>();

  roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels,
                                    max_pts_each_voxel, pts_idx_of_voxels_data,
                                    argmax_data, grad_out_data, grad_in_data,
                                    pool_method);

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)");
  m.def("backward", &roiaware_pool3d_gpu_backward,
        "roiaware pool3d backward (CUDA)");
  m.def("points_in_boxes_part", &points_in_boxes_part,
        "points_in_boxes_part forward (CUDA)");
  m.def("points_in_boxes_all", &points_in_boxes_all,
        "points_in_boxes_all forward (CUDA)");
  m.def("points_in_boxes_cpu", &points_in_boxes_cpu,
        "points_in_boxes_cpu forward (CPU)");
}
mmdet3d/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
deleted
100644 → 0
View file @
9c7270d0
// Modified from
// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
// Written by Shaoshuai Shi
// All Rights Reserved 2019.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <torch/serialize/tensor.h>
#include <torch/types.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__
inline
void
lidar_to_local_coords
(
float
shift_x
,
float
shift_y
,
float
rz
,
float
&
local_x
,
float
&
local_y
)
{
float
cosa
=
cos
(
-
rz
),
sina
=
sin
(
-
rz
);
local_x
=
shift_x
*
cosa
+
shift_y
*
(
-
sina
);
local_y
=
shift_x
*
sina
+
shift_y
*
cosa
;
}
__device__
inline
int
check_pt_in_box3d
(
const
float
*
pt
,
const
float
*
box3d
,
float
&
local_x
,
float
&
local_y
)
{
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, cz in the
// bottom center
float
x
=
pt
[
0
],
y
=
pt
[
1
],
z
=
pt
[
2
];
float
cx
=
box3d
[
0
],
cy
=
box3d
[
1
],
cz
=
box3d
[
2
];
float
x_size
=
box3d
[
3
],
y_size
=
box3d
[
4
],
z_size
=
box3d
[
5
],
rz
=
box3d
[
6
];
cz
+=
z_size
/
2.0
;
// shift to the center since cz in box3d is the bottom center
if
(
fabsf
(
z
-
cz
)
>
z_size
/
2.0
)
return
0
;
lidar_to_local_coords
(
x
-
cx
,
y
-
cy
,
rz
,
local_x
,
local_y
);
float
in_flag
=
(
local_x
>
-
x_size
/
2.0
)
&
(
local_x
<
x_size
/
2.0
)
&
(
local_y
>
-
y_size
/
2.0
)
&
(
local_y
<
y_size
/
2.0
);
return
in_flag
;
}
__global__
void
generate_pts_mask_for_box3d
(
int
boxes_num
,
int
pts_num
,
int
out_x
,
int
out_y
,
int
out_z
,
const
float
*
rois
,
const
float
*
pts
,
int
*
pts_mask
)
{
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z]
// params pts_mask: (N, npoints): -1 means point does not in this box,
// otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
box_idx
=
blockIdx
.
y
;
if
(
pt_idx
>=
pts_num
||
box_idx
>=
boxes_num
)
return
;
pts
+=
pt_idx
*
3
;
rois
+=
box_idx
*
7
;
pts_mask
+=
box_idx
*
pts_num
+
pt_idx
;
float
local_x
=
0
,
local_y
=
0
;
int
cur_in_flag
=
check_pt_in_box3d
(
pts
,
rois
,
local_x
,
local_y
);
pts_mask
[
0
]
=
-
1
;
if
(
cur_in_flag
>
0
)
{
float
local_z
=
pts
[
2
]
-
rois
[
2
];
float
x_size
=
rois
[
3
],
y_size
=
rois
[
4
],
z_size
=
rois
[
5
];
float
x_res
=
x_size
/
out_x
;
float
y_res
=
y_size
/
out_y
;
float
z_res
=
z_size
/
out_z
;
unsigned
int
x_idx
=
int
((
local_x
+
x_size
/
2
)
/
x_res
);
unsigned
int
y_idx
=
int
((
local_y
+
y_size
/
2
)
/
y_res
);
unsigned
int
z_idx
=
int
(
local_z
/
z_res
);
x_idx
=
min
(
max
(
x_idx
,
0
),
out_x
-
1
);
y_idx
=
min
(
max
(
y_idx
,
0
),
out_y
-
1
);
z_idx
=
min
(
max
(
z_idx
,
0
),
out_z
-
1
);
unsigned
int
idx_encoding
=
(
x_idx
<<
16
)
+
(
y_idx
<<
8
)
+
z_idx
;
#ifdef DEBUG
printf
(
"mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, "
"%d), res(%.3f, %.3f, %.3f), idx_encoding=%x
\n
"
,
pt_idx
,
pts
[
0
],
pts
[
1
],
pts
[
2
],
local_x
,
local_y
,
local_z
,
x_idx
,
y_idx
,
z_idx
,
x_res
,
y_res
,
z_res
,
idx_encoding
);
#endif
pts_mask
[
0
]
=
idx_encoding
;
}
}
__global__
void
collect_inside_pts_for_box3d
(
int
boxes_num
,
int
pts_num
,
int
max_pts_each_voxel
,
int
out_x
,
int
out_y
,
int
out_z
,
const
int
*
pts_mask
,
int
*
pts_idx_of_voxels
)
{
// params pts_mask: (N, npoints) 0 or 1
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
int
box_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
box_idx
>=
boxes_num
)
return
;
int
max_num_pts
=
max_pts_each_voxel
-
1
;
// index 0 is the counter
pts_idx_of_voxels
+=
box_idx
*
out_x
*
out_y
*
out_z
*
max_pts_each_voxel
;
for
(
int
k
=
0
;
k
<
pts_num
;
k
++
)
{
if
(
pts_mask
[
box_idx
*
pts_num
+
k
]
!=
-
1
)
{
unsigned
int
idx_encoding
=
pts_mask
[
box_idx
*
pts_num
+
k
];
unsigned
int
x_idx
=
(
idx_encoding
>>
16
)
&
0xFF
;
unsigned
int
y_idx
=
(
idx_encoding
>>
8
)
&
0xFF
;
unsigned
int
z_idx
=
idx_encoding
&
0xFF
;
unsigned
int
base_offset
=
x_idx
*
out_y
*
out_z
*
max_pts_each_voxel
+
y_idx
*
out_z
*
max_pts_each_voxel
+
z_idx
*
max_pts_each_voxel
;
unsigned
int
cnt
=
pts_idx_of_voxels
[
base_offset
];
if
(
cnt
<
max_num_pts
)
{
pts_idx_of_voxels
[
base_offset
+
cnt
+
1
]
=
k
;
pts_idx_of_voxels
[
base_offset
]
++
;
}
#ifdef DEBUG
printf
(
"collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x
\n
"
,
k
,
x_idx
,
y_idx
,
z_idx
,
idx_encoding
);
#endif
}
}
}
__global__
void
roiaware_maxpool3d
(
int
boxes_num
,
int
pts_num
,
int
channels
,
int
max_pts_each_voxel
,
int
out_x
,
int
out_y
,
int
out_z
,
const
float
*
pts_feature
,
const
int
*
pts_idx_of_voxels
,
float
*
pooled_features
,
int
*
argmax
)
{
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
// index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int
box_idx
=
blockIdx
.
z
;
int
channel_idx
=
blockIdx
.
y
;
int
voxel_idx_flat
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
x_idx
=
voxel_idx_flat
/
(
out_y
*
out_z
);
int
y_idx
=
(
voxel_idx_flat
-
x_idx
*
(
out_y
*
out_z
))
/
out_z
;
int
z_idx
=
voxel_idx_flat
%
out_z
;
if
(
box_idx
>=
boxes_num
||
channel_idx
>=
channels
||
x_idx
>=
out_x
||
y_idx
>=
out_y
||
z_idx
>=
out_z
)
return
;
#ifdef DEBUG
printf
(
"src pts_idx_of_voxels: (%p, ), argmax: %p
\n
"
,
pts_idx_of_voxels
,
argmax
);
#endif
int
offset_base
=
x_idx
*
out_y
*
out_z
+
y_idx
*
out_z
+
z_idx
;
pts_idx_of_voxels
+=
box_idx
*
out_x
*
out_y
*
out_z
*
max_pts_each_voxel
+
offset_base
*
max_pts_each_voxel
;
pooled_features
+=
box_idx
*
out_x
*
out_y
*
out_z
*
channels
+
offset_base
*
channels
+
channel_idx
;
argmax
+=
box_idx
*
out_x
*
out_y
*
out_z
*
channels
+
offset_base
*
channels
+
channel_idx
;
int
argmax_idx
=
-
1
;
float
max_val
=
-
1e50
;
int
total_pts
=
pts_idx_of_voxels
[
0
];
for
(
int
k
=
1
;
k
<=
total_pts
;
k
++
)
{
if
(
pts_feature
[
pts_idx_of_voxels
[
k
]
*
channels
+
channel_idx
]
>
max_val
)
{
max_val
=
pts_feature
[
pts_idx_of_voxels
[
k
]
*
channels
+
channel_idx
];
argmax_idx
=
pts_idx_of_voxels
[
k
];
}
}
if
(
argmax_idx
!=
-
1
)
{
pooled_features
[
0
]
=
max_val
;
}
argmax
[
0
]
=
argmax_idx
;
#ifdef DEBUG
printf
(
"channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after "
"pts_idx: %p, argmax: (%p, %d)
\n
"
,
channel_idx
,
x_idx
,
y_idx
,
z_idx
,
argmax_idx
,
max_val
,
total_pts
,
pts_idx_of_voxels
,
argmax
,
argmax_idx
);
#endif
}
__global__
void
roiaware_avgpool3d
(
int
boxes_num
,
int
pts_num
,
int
channels
,
int
max_pts_each_voxel
,
int
out_x
,
int
out_y
,
int
out_z
,
const
float
*
pts_feature
,
const
int
*
pts_idx_of_voxels
,
float
*
pooled_features
)
{
// params pts_feature: (npoints, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
// index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
// params argmax: (N, out_x, out_y, out_z, C)
int
box_idx
=
blockIdx
.
z
;
int
channel_idx
=
blockIdx
.
y
;
int
voxel_idx_flat
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
x_idx
=
voxel_idx_flat
/
(
out_y
*
out_z
);
int
y_idx
=
(
voxel_idx_flat
-
x_idx
*
(
out_y
*
out_z
))
/
out_z
;
int
z_idx
=
voxel_idx_flat
%
out_z
;
if
(
box_idx
>=
boxes_num
||
channel_idx
>=
channels
||
x_idx
>=
out_x
||
y_idx
>=
out_y
||
z_idx
>=
out_z
)
return
;
int
offset_base
=
x_idx
*
out_y
*
out_z
+
y_idx
*
out_z
+
z_idx
;
pts_idx_of_voxels
+=
box_idx
*
out_x
*
out_y
*
out_z
*
max_pts_each_voxel
+
offset_base
*
max_pts_each_voxel
;
pooled_features
+=
box_idx
*
out_x
*
out_y
*
out_z
*
channels
+
offset_base
*
channels
+
channel_idx
;
float
sum_val
=
0
;
int
total_pts
=
pts_idx_of_voxels
[
0
];
for
(
int
k
=
1
;
k
<=
total_pts
;
k
++
)
{
sum_val
+=
pts_feature
[
pts_idx_of_voxels
[
k
]
*
channels
+
channel_idx
];
}
if
(
total_pts
>
0
)
{
pooled_features
[
0
]
=
sum_val
/
total_pts
;
}
}
void
roiaware_pool3d_launcher
(
int
boxes_num
,
int
pts_num
,
int
channels
,
int
max_pts_each_voxel
,
int
out_x
,
int
out_y
,
int
out_z
,
const
float
*
rois
,
const
float
*
pts
,
const
float
*
pts_feature
,
int
*
argmax
,
int
*
pts_idx_of_voxels
,
float
*
pooled_features
,
int
pool_method
)
{
// params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR coordinate
// params pts: (npoints, 3) [x, y, z] in LiDAR coordinate
// params pts_feature: (npoints, C)
// params argmax: (N, out_x, out_y, out_z, C)
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
int
*
pts_mask
=
NULL
;
cudaMalloc
(
&
pts_mask
,
boxes_num
*
pts_num
*
sizeof
(
int
));
// (N, M)
cudaMemset
(
pts_mask
,
-
1
,
boxes_num
*
pts_num
*
sizeof
(
int
));
dim3
blocks_mask
(
DIVUP
(
pts_num
,
THREADS_PER_BLOCK
),
boxes_num
);
dim3
threads
(
THREADS_PER_BLOCK
);
generate_pts_mask_for_box3d
<<<
blocks_mask
,
threads
>>>
(
boxes_num
,
pts_num
,
out_x
,
out_y
,
out_z
,
rois
,
pts
,
pts_mask
);
// TODO: Merge the collect and pool functions, SS
dim3
blocks_collect
(
DIVUP
(
boxes_num
,
THREADS_PER_BLOCK
));
collect_inside_pts_for_box3d
<<<
blocks_collect
,
threads
>>>
(
boxes_num
,
pts_num
,
max_pts_each_voxel
,
out_x
,
out_y
,
out_z
,
pts_mask
,
pts_idx_of_voxels
);
dim3
blocks_pool
(
DIVUP
(
out_x
*
out_y
*
out_z
,
THREADS_PER_BLOCK
),
channels
,
boxes_num
);
if
(
pool_method
==
0
)
{
roiaware_maxpool3d
<<<
blocks_pool
,
threads
>>>
(
boxes_num
,
pts_num
,
channels
,
max_pts_each_voxel
,
out_x
,
out_y
,
out_z
,
pts_feature
,
pts_idx_of_voxels
,
pooled_features
,
argmax
);
}
else
if
(
pool_method
==
1
)
{
roiaware_avgpool3d
<<<
blocks_pool
,
threads
>>>
(
boxes_num
,
pts_num
,
channels
,
max_pts_each_voxel
,
out_x
,
out_y
,
out_z
,
pts_feature
,
pts_idx_of_voxels
,
pooled_features
);
}
cudaFree
(
pts_mask
);
#ifdef DEBUG
cudaDeviceSynchronize
();
// for using printf in kernel function
#endif
}
__global__
void
roiaware_maxpool3d_backward
(
int
boxes_num
,
int
channels
,
int
out_x
,
int
out_y
,
int
out_z
,
const
int
*
argmax
,
const
float
*
grad_out
,
float
*
grad_in
)
{
// params argmax: (N, out_x, out_y, out_z, C)
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
int
box_idx
=
blockIdx
.
z
;
int
channel_idx
=
blockIdx
.
y
;
int
voxel_idx_flat
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
x_idx
=
voxel_idx_flat
/
(
out_y
*
out_z
);
int
y_idx
=
(
voxel_idx_flat
-
x_idx
*
(
out_y
*
out_z
))
/
out_z
;
int
z_idx
=
voxel_idx_flat
%
out_z
;
if
(
box_idx
>=
boxes_num
||
channel_idx
>=
channels
||
x_idx
>=
out_x
||
y_idx
>=
out_y
||
z_idx
>=
out_z
)
return
;
int
offset_base
=
x_idx
*
out_y
*
out_z
+
y_idx
*
out_z
+
z_idx
;
argmax
+=
box_idx
*
out_x
*
out_y
*
out_z
*
channels
+
offset_base
*
channels
+
channel_idx
;
grad_out
+=
box_idx
*
out_x
*
out_y
*
out_z
*
channels
+
offset_base
*
channels
+
channel_idx
;
if
(
argmax
[
0
]
==
-
1
)
return
;
atomicAdd
(
grad_in
+
argmax
[
0
]
*
channels
+
channel_idx
,
grad_out
[
0
]
*
1
);
}
__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
                                            int out_x, int out_y, int out_z,
                                            int max_pts_each_voxel,
                                            const int *pts_idx_of_voxels,
                                            const float *grad_out,
                                            float *grad_in) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value

  int box_idx = blockIdx.z;
  int channel_idx = blockIdx.y;
  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;

  int x_idx = voxel_idx_flat / (out_y * out_z);
  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
  int z_idx = voxel_idx_flat % out_z;
  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
      y_idx >= out_y || z_idx >= out_z)
    return;

  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                       offset_base * max_pts_each_voxel;
  grad_out += box_idx * out_x * out_y * out_z * channels +
              offset_base * channels + channel_idx;

  int total_pts = pts_idx_of_voxels[0];
  float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
  for (int k = 1; k <= total_pts; k++) {
    atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
              grad_out[0] * cur_grad);
  }
}
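Likewise, a hedged reference of the average-pool backward above: every point listed for a voxel receives grad_out / max(total_pts, 1) on that channel. avgpool3d_backward_ref and its npoints argument are illustrative names.

import numpy as np

def avgpool3d_backward_ref(pts_idx_of_voxels, grad_out, npoints):
    # pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), where
    #   slot [..., 0] is the point count and slots [..., 1:] are point indices.
    # grad_out: (N, out_x, out_y, out_z, C); returns grad_in: (npoints, C)
    N, X, Y, Z, C = grad_out.shape
    grad_in = np.zeros((npoints, C), dtype=grad_out.dtype)
    for n, x, y, z in np.ndindex(N, X, Y, Z):
        total = int(pts_idx_of_voxels[n, x, y, z, 0])
        g = grad_out[n, x, y, z] / max(float(total), 1.0)
        for k in range(1, total + 1):
            grad_in[pts_idx_of_voxels[n, x, y, z, k]] += g
    return grad_in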
void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y,
                                       int out_z, int channels,
                                       int max_pts_each_voxel,
                                       const int *pts_idx_of_voxels,
                                       const int *argmax, const float *grad_out,
                                       float *grad_in, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params argmax: (N, out_x, out_y, out_z, C)
  // params grad_out: (N, out_x, out_y, out_z, C)
  // params grad_in: (npoints, C), return value
  // params pool_method: 0: max_pool, 1: avg_pool

  dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
              boxes_num);
  dim3 threads(THREADS_PER_BLOCK);
  if (pool_method == 0) {
    roiaware_maxpool3d_backward<<<blocks, threads>>>(
        boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in);
  } else if (pool_method == 1) {
    roiaware_avgpool3d_backward<<<blocks, threads>>>(
        boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,
        pts_idx_of_voxels, grad_out, grad_in);
  }
}
mmdet3d/ops/roipoint_pool3d/__init__.py
deleted
100644 → 0
View file @
9c7270d0
# Copyright (c) OpenMMLab. All rights reserved.
from .roipoint_pool3d import RoIPointPool3d

__all__ = ['RoIPointPool3d']
mmdet3d/ops/roipoint_pool3d/roipoint_pool3d.py
deleted
100644 → 0
View file @
9c7270d0
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn as nn
from torch.autograd import Function

from . import roipoint_pool3d_ext


class RoIPointPool3d(nn.Module):

    def __init__(self, num_sampled_points=512):
        """
        Args:
            num_sampled_points (int): Number of samples in each roi
        """
        super().__init__()
        self.num_sampled_points = num_sampled_points

    def forward(self, points, point_features, boxes3d):
        """
        Args:
            points (torch.Tensor): Input points whose shape is (B, N, 3)
            point_features (torch.Tensor): Input point features whose shape
                is (B, N, C)
            boxes3d (torch.Tensor): Input bounding boxes whose shape is
                (B, M, 7), [x, y, z, dx, dy, dz, heading]

        Returns:
            torch.Tensor: (B, M, 512, 3 + C) pooled_features
            torch.Tensor: (B, M) pooled_empty_flag
        """
        return RoIPointPool3dFunction.apply(points, point_features, boxes3d,
                                            self.num_sampled_points)


class RoIPointPool3dFunction(Function):

    @staticmethod
    def forward(ctx, points, point_features, boxes3d, num_sampled_points=512):
        """
        Args:
            points (torch.Tensor): Input points whose shape is (B, N, 3)
            point_features (torch.Tensor): Input point features whose shape
                is (B, N, C)
            boxes3d (torch.Tensor): Input bounding boxes whose shape is
                (B, M, 7)
            num_sampled_points (int): the number of sampled points

        Returns:
            torch.Tensor: (B, M, 512, 3 + C) pooled_features
            torch.Tensor: (B, M) pooled_empty_flag
        """
        assert points.shape.__len__() == 3 and points.shape[2] == 3
        batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[
            1], point_features.shape[2]
        pooled_boxes3d = boxes3d.view(batch_size, -1, 7)
        pooled_features = point_features.new_zeros(
            (batch_size, boxes_num, num_sampled_points, 3 + feature_len))
        pooled_empty_flag = point_features.new_zeros(
            (batch_size, boxes_num)).int()

        roipoint_pool3d_ext.forward(points.contiguous(),
                                    pooled_boxes3d.contiguous(),
                                    point_features.contiguous(),
                                    pooled_features, pooled_empty_flag)

        return pooled_features, pooled_empty_flag

    @staticmethod
    def backward(ctx, grad_out):
        raise NotImplementedError


if __name__ == '__main__':
    pass
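A hedged usage sketch of the module as it existed before this commit removed it; it assumes the roipoint_pool3d_ext CUDA extension is compiled and a GPU is available, and the tensor shapes follow the docstrings above.

import torch
from mmdet3d.ops.roipoint_pool3d import RoIPointPool3d

points = torch.rand(2, 1024, 3).cuda()           # (B, N, 3) xyz in LiDAR frame
point_features = torch.rand(2, 1024, 16).cuda()  # (B, N, C)
boxes3d = torch.rand(2, 8, 7).cuda()             # (B, M, 7) [x, y, z, dx, dy, dz, heading]

pool = RoIPointPool3d(num_sampled_points=512)
pooled_features, pooled_empty_flag = pool(points, point_features, boxes3d)
# pooled_features: (2, 8, 512, 3 + 16); pooled_empty_flag: (2, 8)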
mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
deleted
100644 → 0
View file @
9c7270d0
/*
Modified from
https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#define CHECK_CUDA(x) do { \
if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num,
                       int feature_in_len, int sampled_pts_num,
                       const float *xyz, const float *boxes3d,
                       const float *pts_feature, float *pooled_features,
                       int *pooled_empty_flag);

int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature,
                  at::Tensor pooled_features, at::Tensor pooled_empty_flag) {
  // params xyz: (B, N, 3)
  // params boxes3d: (B, M, 7)
  // params pts_feature: (B, N, C)
  // params pooled_features: (B, M, 512, 3+C)
  // params pooled_empty_flag: (B, M)
  CHECK_INPUT(xyz);
  CHECK_INPUT(boxes3d);
  CHECK_INPUT(pts_feature);
  CHECK_INPUT(pooled_features);
  CHECK_INPUT(pooled_empty_flag);

  int batch_size = xyz.size(0);
  int pts_num = xyz.size(1);
  int boxes_num = boxes3d.size(1);
  int feature_in_len = pts_feature.size(2);
  int sampled_pts_num = pooled_features.size(2);

  const float *xyz_data = xyz.data<float>();
  const float *boxes3d_data = boxes3d.data<float>();
  const float *pts_feature_data = pts_feature.data<float>();
  float *pooled_features_data = pooled_features.data<float>();
  int *pooled_empty_flag_data = pooled_empty_flag.data<int>();

  roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len,
                    sampled_pts_num, xyz_data, boxes3d_data, pts_feature_data,
                    pooled_features_data, pooled_empty_flag_data);

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)");
}
mmdet3d/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
deleted
100644 → 0
View file @
9c7270d0
/*
Modified from
https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <math.h>
#include <stdio.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y,
                                             float rz, float &local_x,
                                             float &local_y) {
  float cosa = cos(-rz), sina = sin(-rz);
  local_x = shift_x * cosa + shift_y * (-sina);
  local_y = shift_x * sina + shift_y * cosa;
}
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y) {
  // param pt: (x, y, z)
  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the
  // bottom center
  float x = pt[0], y = pt[1], z = pt[2];
  float cx = box3d[0], cy = box3d[1], cz = box3d[2];
  float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
  cz += dz / 2.0;  // shift to the center since cz in box3d is the bottom center

  if (fabsf(z - cz) > dz / 2.0) return 0;
  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
  float in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &
                  (local_y > -dy / 2.0) & (local_y < dy / 2.0);
  return in_flag;
}
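A hedged pure-Python reference of the membership test above: shift the point into the box frame, rotate by -rz, and compare against the half extents. check_pt_in_box3d_ref is an illustrative name, not part of the extension.

import math

def check_pt_in_box3d_ref(pt, box3d):
    x, y, z = pt
    cx, cy, cz, dx, dy, dz, rz = box3d
    cz += dz / 2.0                      # box cz is the bottom center
    if abs(z - cz) > dz / 2.0:
        return False
    cosa, sina = math.cos(-rz), math.sin(-rz)
    local_x = (x - cx) * cosa - (y - cy) * sina
    local_y = (x - cx) * sina + (y - cy) * cosa
    return abs(local_x) < dx / 2.0 and abs(local_y) < dy / 2.0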
__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
                                    const float *xyz, const float *boxes3d,
                                    int *pts_assign) {
  // params xyz: (B, N, 3)
  // params boxes3d: (B, M, 7)
  // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
  // background points
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int box_idx = blockIdx.y;
  int bs_idx = blockIdx.z;

  if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) {
    return;
  }
  int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
  pts_assign[assign_idx] = 0;

  int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
  int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;

  float local_x = 0, local_y = 0;
  int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
                                      local_x, local_y);
  pts_assign[assign_idx] = cur_in_flag;
  // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx,
  //        pts_assign[bs_idx * pts_num + pt_idx]);
}
__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
                               int sampled_pts_num, const int *pts_assign,
                               int *pts_idx, int *pooled_empty_flag) {
  // params xyz: (B, N, 3)
  // params pts_feature: (B, N, C)
  // params pts_assign: (B, N)
  // params pts_idx: (B, M, 512)
  // params pooled_empty_flag: (B, M)

  int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (boxes_idx >= boxes_num) {
    return;
  }

  int bs_idx = blockIdx.y;

  int cnt = 0;
  for (int k = 0; k < pts_num; k++) {
    if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) {
      if (cnt < sampled_pts_num) {
        pts_idx[bs_idx * boxes_num * sampled_pts_num +
                boxes_idx * sampled_pts_num + cnt] = k;
        cnt++;
      } else
        break;
    }
  }

  if (cnt == 0) {
    pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
  } else if (cnt < sampled_pts_num) {
    // duplicate same points for sampling
    for (int k = cnt; k < sampled_pts_num; k++) {
      int duplicate_idx = k % cnt;
      int base_offset =
          bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
      pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
    }
  }
}
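A hedged sketch of the sampling rule implemented above: keep up to sampled_pts_num indices of points inside a box, repeat the collected indices cyclically (duplicate_idx = k % cnt) when there are fewer, and flag the box empty when there are none. sample_box_indices is an illustrative name.

def sample_box_indices(inside_indices, sampled_pts_num=512):
    # inside_indices: list of point indices falling inside one box.
    if not inside_indices:
        return [], True                              # pooled_empty_flag = 1
    picked = inside_indices[:sampled_pts_num]
    cnt = len(picked)
    while len(picked) < sampled_pts_num:
        picked.append(picked[len(picked) % cnt])     # duplicate_idx = k % cnt
    return picked, False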
__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num,
                                  int feature_in_len, int sampled_pts_num,
                                  const float *xyz, const int *pts_idx,
                                  const float *pts_feature,
                                  float *pooled_features,
                                  int *pooled_empty_flag) {
  // params xyz: (B, N, 3)
  // params pts_idx: (B, M, 512)
  // params pts_feature: (B, N, C)
  // params pooled_features: (B, M, 512, 3+C)
  // params pooled_empty_flag: (B, M)

  int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int box_idx = blockIdx.y;
  int bs_idx = blockIdx.z;

  if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num ||
      bs_idx >= batch_size) {
    return;
  }

  if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) {
    return;
  }

  int temp_idx = bs_idx * boxes_num * sampled_pts_num +
                 box_idx * sampled_pts_num + sample_pt_idx;
  int src_pt_idx = pts_idx[temp_idx];
  int dst_feature_offset = temp_idx * (3 + feature_in_len);

  for (int j = 0; j < 3; j++)
    pooled_features[dst_feature_offset + j] =
        xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];

  int src_feature_offset =
      bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
  for (int j = 0; j < feature_in_len; j++)
    pooled_features[dst_feature_offset + 3 + j] =
        pts_feature[src_feature_offset + j];
}
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num,
                       int feature_in_len, int sampled_pts_num,
                       const float *xyz, const float *boxes3d,
                       const float *pts_feature, float *pooled_features,
                       int *pooled_empty_flag) {
  // printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num,
  //        boxes_num);

  int *pts_assign = NULL;
  cudaMalloc(&pts_assign,
             batch_size * pts_num * boxes_num * sizeof(int));  // (batch_size, N, M)
  // cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int));

  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num,
              batch_size);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num, xyz,
                                           boxes3d, pts_assign);

  int *pts_idx = NULL;
  cudaMalloc(&pts_idx,
             batch_size * boxes_num * sampled_pts_num *
                 sizeof(int));  // (batch_size, M, sampled_pts_num)

  dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK),
               batch_size);  // blockIdx.x(col), blockIdx.y(row)
  get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num,
                                       sampled_pts_num, pts_assign, pts_idx,
                                       pooled_empty_flag);

  dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
                   batch_size);
  roipool3d_forward<<<blocks_pool, threads>>>(
      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,
      pts_idx, pts_feature, pooled_features, pooled_empty_flag);

  cudaFree(pts_assign);
  cudaFree(pts_idx);

#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}