OpenDAS / mmdetection3d · Commit 701e5032

Authored May 18, 2020 by zhangwenwei

Merge branch 'feat_pointnet_ops' into 'master'

Feat pointnet ops

See merge request open-mmlab/mmdet.3d!34

Parents: 28e511cd 006b7ccf

Changes: 26 · Showing 20 changed files with 1037 additions and 3 deletions (+1037 −3)
Changed files (this page):

mmdet3d/models/roi_heads/__init__.py                                 +2   −2
mmdet3d/ops/__init__.py                                              +8   −1
mmdet3d/ops/ball_query/__init__.py                                   +3   −0
mmdet3d/ops/ball_query/ball_query.py                                 +45  −0
mmdet3d/ops/ball_query/src/ball_query.cpp                            +37  −0
mmdet3d/ops/ball_query/src/ball_query_cuda.cu                        +67  −0
mmdet3d/ops/furthest_point_sample/__init__.py                        +3   −0
mmdet3d/ops/furthest_point_sample/furthest_point_sample.py           +42  −0
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample.cpp      +30  −0
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample_cuda.cu  +181 −0
mmdet3d/ops/gather_points/__init__.py                                +3   −0
mmdet3d/ops/gather_points/gather_points.py                           +52  −0
mmdet3d/ops/gather_points/src/gather_points.cpp                      +49  −0
mmdet3d/ops/gather_points/src/gather_points_cuda.cu                  +84  −0
mmdet3d/ops/group_points/__init__.py                                 +3   −0
mmdet3d/ops/group_points/group_points.py                             +205 −0
mmdet3d/ops/group_points/src/group_points.cpp                        +53  −0
mmdet3d/ops/group_points/src/group_points_cuda.cu                    +85  −0
mmdet3d/ops/interpolate/__init__.py                                  +4   −0
mmdet3d/ops/interpolate/src/interpolate.cpp                          +81  −0
mmdet3d/models/roi_heads/__init__.py

@@ -2,9 +2,9 @@ from .base_3droi_head import Base3DRoIHead
 from .bbox_heads import PartA2BboxHead
 from .mask_heads import PointwiseSemanticHead
 from .part_aggregation_roi_head import PartAggregationROIHead
-from .roi_extractors import Single3DRoIAwareExtractor
+from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor

 __all__ = [
     'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead',
-    'Single3DRoIAwareExtractor', 'PartA2BboxHead'
+    'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor'
 ]
mmdet3d/ops/__init__.py

 from mmdet.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version,
                        get_compiling_cuda_version, nms, roi_align,
                        sigmoid_focal_loss)
+from .ball_query import ball_query
+from .furthest_point_sample import furthest_point_sample
+from .gather_points import gather_points
+from .group_points import group_points, grouping_operation
+from .interpolate import three_interpolate, three_nn
 from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d
 from .roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_cpu,
                               points_in_boxes_gpu)
@@ -15,5 +20,7 @@ __all__ = [
     'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss',
     'SigmoidFocalLoss', 'SparseBasicBlock', 'SparseBottleneck',
     'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu',
-    'make_sparse_convmodule'
+    'make_sparse_convmodule', 'ball_query', 'furthest_point_sample',
+    'three_interpolate', 'three_nn', 'gather_points', 'grouping_operation',
+    'group_points'
 ]
mmdet3d/ops/ball_query/__init__.py (new file)

from .ball_query import ball_query

__all__ = ['ball_query']
mmdet3d/ops/ball_query/ball_query.py (new file)

import torch
from torch.autograd import Function

from . import ball_query_ext


class BallQuery(Function):
    """Ball Query.

    Find nearby points in spherical space.
    """

    @staticmethod
    def forward(ctx, radius: float, sample_num: int, xyz: torch.Tensor,
                center_xyz: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            radius (float): radius of the balls.
            sample_num (int): maximum number of features in the balls.
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) centers of the ball query.

        Returns:
            Tensor: (B, npoint, nsample) tensor with the indices of
                the features that form the query balls.
        """
        assert center_xyz.is_contiguous()
        assert xyz.is_contiguous()

        B, N, _ = xyz.size()
        npoint = center_xyz.size(1)
        idx = torch.cuda.IntTensor(B, npoint, sample_num).zero_()

        ball_query_ext.ball_query_wrapper(B, N, npoint, radius, sample_num,
                                          center_xyz, xyz, idx)
        ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        return None, None, None, None


ball_query = BallQuery.apply
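A minimal usage sketch (illustrative, not part of this commit), assuming a CUDA-enabled build of mmdet3d; shapes follow the docstring above:

import torch
from mmdet3d.ops import ball_query

xyz = torch.rand(2, 1024, 3).cuda()      # (B, N, 3) point coordinates
centers = torch.rand(2, 128, 3).cuda()   # (B, npoint, 3) query-ball centers
idx = ball_query(0.2, 16, xyz, centers)  # (B, 128, 16) int32 indices into xyz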
mmdet3d/ops/ball_query/src/ball_query.cpp (new file)

#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>

extern THCState *state;

#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
                       at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
                       at::Tensor idx_tensor);

void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
                                const float *xyz, const float *new_xyz,
                                int *idx, cudaStream_t stream);

int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
                       at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
                       at::Tensor idx_tensor) {
  CHECK_INPUT(new_xyz_tensor);
  CHECK_INPUT(xyz_tensor);
  const float *new_xyz = new_xyz_tensor.data<float>();
  const float *xyz = xyz_tensor.data<float>();
  int *idx = idx_tensor.data<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  ball_query_kernel_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx,
                             stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("ball_query_wrapper", &ball_query_wrapper, "ball_query_wrapper");
}
mmdet3d/ops/ball_query/src/ball_query_cuda.cu (new file)

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void ball_query_kernel(int b, int n, int m, float radius,
                                  int nsample,
                                  const float *__restrict__ new_xyz,
                                  const float *__restrict__ xyz,
                                  int *__restrict__ idx) {
  // new_xyz: (B, M, 3)
  // xyz: (B, N, 3)
  // output:
  //      idx: (B, M, nsample)
  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= m) return;

  new_xyz += bs_idx * m * 3 + pt_idx * 3;
  xyz += bs_idx * n * 3;
  idx += bs_idx * m * nsample + pt_idx * nsample;

  float radius2 = radius * radius;
  float new_x = new_xyz[0];
  float new_y = new_xyz[1];
  float new_z = new_xyz[2];

  int cnt = 0;
  for (int k = 0; k < n; ++k) {
    float x = xyz[k * 3 + 0];
    float y = xyz[k * 3 + 1];
    float z = xyz[k * 3 + 2];
    float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
               (new_z - z) * (new_z - z);
    if (d2 < radius2) {
      if (cnt == 0) {
        for (int l = 0; l < nsample; ++l) {
          idx[l] = k;
        }
      }
      idx[cnt] = k;
      ++cnt;
      if (cnt >= nsample) break;
    }
  }
}

void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
                                const float *new_xyz, const float *xyz,
                                int *idx, cudaStream_t stream) {
  // new_xyz: (B, M, 3)
  // xyz: (B, N, 3)
  // output:
  //      idx: (B, M, nsample)
  cudaError_t err;

  dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  ball_query_kernel<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample,
                                                    new_xyz, xyz, idx);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
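For readers checking the kernel's semantics, here is a hedged pure-PyTorch reference (not part of this commit; ball_query_reference is a hypothetical name). It mirrors the kernel's padding behavior: every slot is pre-filled with the first in-range point, then overwritten in scan order up to nsample hits; centers with no in-range point keep index 0.

import torch

def ball_query_reference(radius, nsample, xyz, center_xyz):
    """xyz: (B, N, 3); center_xyz: (B, M, 3) -> idx: (B, M, nsample)."""
    B, N, _ = xyz.shape
    M = center_xyz.shape[1]
    idx = torch.zeros(B, M, nsample, dtype=torch.int64)
    mask = torch.cdist(center_xyz, xyz) ** 2 < radius * radius  # (B, M, N)
    for b in range(B):
        for q in range(M):
            hits = mask[b, q].nonzero(as_tuple=False).flatten()
            if hits.numel() == 0:
                continue  # kernel leaves the zero-initialized indices
            padded = hits[0].repeat(nsample)   # pad all slots with the first hit
            take = min(nsample, hits.numel())
            padded[:take] = hits[:take]        # overwrite in scan order
            idx[b, q] = padded
    return idx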
mmdet3d/ops/furthest_point_sample/__init__.py (new file)

from .furthest_point_sample import furthest_point_sample

__all__ = ['furthest_point_sample']
mmdet3d/ops/furthest_point_sample/furthest_point_sample.py (new file)

import torch
from torch.autograd import Function

from . import furthest_point_sample_ext


class FurthestPointSampling(Function):
    """Furthest Point Sampling.

    Uses iterative furthest point sampling to select a set of
    features whose corresponding points have the furthest distance.
    """

    @staticmethod
    def forward(ctx, points_xyz: torch.Tensor,
                num_points: int) -> torch.Tensor:
        """forward.

        Args:
            points_xyz (Tensor): (B, N, 3) where N > num_points.
            num_points (int): Number of points in the sampled set.

        Returns:
            Tensor: (B, num_points) indices of the sampled points.
        """
        assert points_xyz.is_contiguous()

        B, N, _ = points_xyz.size()
        output = torch.cuda.IntTensor(B, num_points)
        temp = torch.cuda.FloatTensor(B, N).fill_(1e10)

        furthest_point_sample_ext.furthest_point_sampling_wrapper(
            B, N, num_points, points_xyz, temp, output)
        ctx.mark_non_differentiable(output)
        return output

    @staticmethod
    def backward(xyz, a=None):
        return None, None


furthest_point_sample = FurthestPointSampling.apply
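A hedged usage sketch (illustrative, not part of this commit), pairing the sampler with the ball_query op from this merge request; assumes a CUDA build:

import torch
from mmdet3d.ops import ball_query, furthest_point_sample

points_xyz = torch.rand(2, 1024, 3).cuda()        # (B, N, 3)
fps_idx = furthest_point_sample(points_xyz, 128)  # (B, 128) int32 indices
centers = points_xyz.gather(
    1, fps_idx.long().unsqueeze(-1).expand(-1, -1, 3))  # (B, 128, 3)
neighbor_idx = ball_query(0.2, 16, points_xyz, centers.contiguous())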
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample.cpp (new file)

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>

extern THCState *state;

int furthest_point_sampling_wrapper(int b, int n, int m,
                                    at::Tensor points_tensor,
                                    at::Tensor temp_tensor,
                                    at::Tensor idx_tensor);

void furthest_point_sampling_kernel_launcher(int b, int n, int m,
                                             const float *dataset, float *temp,
                                             int *idxs, cudaStream_t stream);

int furthest_point_sampling_wrapper(int b, int n, int m,
                                    at::Tensor points_tensor,
                                    at::Tensor temp_tensor,
                                    at::Tensor idx_tensor) {
  const float *points = points_tensor.data<float>();
  float *temp = temp_tensor.data<float>();
  int *idx = idx_tensor.data<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper,
        "furthest_point_sampling_wrapper");
}
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample_cuda.cu (new file)

#include <stdio.h>
#include <stdlib.h>

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

inline int opt_n_threads(int work_size) {
  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
  return max(min(1 << pow_2, TOTAL_THREADS), 1);
}

__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
                         int idx1, int idx2) {
  const float v1 = dists[idx1], v2 = dists[idx2];
  const int i1 = dists_i[idx1], i2 = dists_i[idx2];
  dists[idx1] = max(v1, v2);
  dists_i[idx1] = v2 > v1 ? i2 : i1;
}

template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
  // dataset: (B, N, 3)
  // tmp: (B, N)
  // output:
  //      idx: (B, M)
  if (m <= 0) return;
  __shared__ float dists[block_size];
  __shared__ int dists_i[block_size];

  int batch_index = blockIdx.x;
  dataset += batch_index * n * 3;
  temp += batch_index * n;
  idxs += batch_index * m;

  int tid = threadIdx.x;
  const int stride = block_size;

  int old = 0;
  if (threadIdx.x == 0) idxs[0] = old;

  __syncthreads();
  for (int j = 1; j < m; j++) {
    int besti = 0;
    float best = -1;
    float x1 = dataset[old * 3 + 0];
    float y1 = dataset[old * 3 + 1];
    float z1 = dataset[old * 3 + 2];
    for (int k = tid; k < n; k += stride) {
      float x2, y2, z2;
      x2 = dataset[k * 3 + 0];
      y2 = dataset[k * 3 + 1];
      z2 = dataset[k * 3 + 2];
      // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
      // if (mag <= 1e-3)
      //     continue;

      float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) +
                (z2 - z1) * (z2 - z1);
      float d2 = min(d, temp[k]);
      temp[k] = d2;
      besti = d2 > best ? k : besti;
      best = d2 > best ? d2 : best;
    }
    dists[tid] = best;
    dists_i[tid] = besti;
    __syncthreads();

    if (block_size >= 1024) {
      if (tid < 512) {
        __update(dists, dists_i, tid, tid + 512);
      }
      __syncthreads();
    }
    if (block_size >= 512) {
      if (tid < 256) {
        __update(dists, dists_i, tid, tid + 256);
      }
      __syncthreads();
    }
    if (block_size >= 256) {
      if (tid < 128) {
        __update(dists, dists_i, tid, tid + 128);
      }
      __syncthreads();
    }
    if (block_size >= 128) {
      if (tid < 64) {
        __update(dists, dists_i, tid, tid + 64);
      }
      __syncthreads();
    }
    if (block_size >= 64) {
      if (tid < 32) {
        __update(dists, dists_i, tid, tid + 32);
      }
      __syncthreads();
    }
    if (block_size >= 32) {
      if (tid < 16) {
        __update(dists, dists_i, tid, tid + 16);
      }
      __syncthreads();
    }
    if (block_size >= 16) {
      if (tid < 8) {
        __update(dists, dists_i, tid, tid + 8);
      }
      __syncthreads();
    }
    if (block_size >= 8) {
      if (tid < 4) {
        __update(dists, dists_i, tid, tid + 4);
      }
      __syncthreads();
    }
    if (block_size >= 4) {
      if (tid < 2) {
        __update(dists, dists_i, tid, tid + 2);
      }
      __syncthreads();
    }
    if (block_size >= 2) {
      if (tid < 1) {
        __update(dists, dists_i, tid, tid + 1);
      }
      __syncthreads();
    }

    old = dists_i[0];
    if (tid == 0) idxs[j] = old;
  }
}

void furthest_point_sampling_kernel_launcher(int b, int n, int m,
                                             const float *dataset, float *temp,
                                             int *idxs, cudaStream_t stream) {
  // dataset: (B, N, 3)
  // tmp: (B, N)
  // output:
  //      idx: (B, M)
  cudaError_t err;
  unsigned int n_threads = opt_n_threads(n);

  switch (n_threads) {
    case 1024:
      furthest_point_sampling_kernel<1024>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 512:
      furthest_point_sampling_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 256:
      furthest_point_sampling_kernel<256>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 128:
      furthest_point_sampling_kernel<128>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 64:
      furthest_point_sampling_kernel<64>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 32:
      furthest_point_sampling_kernel<32>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 16:
      furthest_point_sampling_kernel<16>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 8:
      furthest_point_sampling_kernel<8>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 4:
      furthest_point_sampling_kernel<4>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 2:
      furthest_point_sampling_kernel<2>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    case 1:
      furthest_point_sampling_kernel<1>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
      break;
    default:
      furthest_point_sampling_kernel<512>
          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
  }

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
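The per-batch loop the kernel parallelizes can be stated compactly in PyTorch. A hedged reference (not part of this commit; fps_reference is a hypothetical name, and it assumes a recent PyTorch with torch.minimum); min_d2 plays the role of the kernel's temp buffer:

import torch

def fps_reference(points_xyz, num_points):
    """points_xyz: (B, N, 3) -> (B, num_points) int64 indices."""
    B, N, _ = points_xyz.shape
    idxs = torch.zeros(B, num_points, dtype=torch.int64)
    min_d2 = torch.full((B, N), 1e10)             # running min squared distance to the chosen set
    farthest = torch.zeros(B, dtype=torch.int64)  # start from point 0, as the kernel does
    for j in range(num_points):
        idxs[:, j] = farthest
        centroid = points_xyz[torch.arange(B), farthest].unsqueeze(1)  # (B, 1, 3)
        d2 = ((points_xyz - centroid) ** 2).sum(-1)                    # (B, N)
        min_d2 = torch.minimum(min_d2, d2)
        farthest = min_d2.argmax(dim=1)           # next pick: furthest from the set
    return idxs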
mmdet3d/ops/gather_points/__init__.py (new file)

from .gather_points import gather_points

__all__ = ['gather_points']
mmdet3d/ops/gather_points/gather_points.py (new file)

import torch
from torch.autograd import Function

from . import gather_points_ext


class GatherPoints(Function):
    """Gather Points.

    Gather points with given index.
    """

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            features (Tensor): (B, C, N) features to gather.
            indices (Tensor): (B, M) where M is the number of points.

        Returns:
            Tensor: (B, C, M) where M is the number of points.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, npoint = indices.size()
        _, C, N = features.size()
        output = torch.cuda.FloatTensor(B, C, npoint)

        gather_points_ext.gather_points_wrapper(B, C, N, npoint, features,
                                                indices, output)

        ctx.for_backwards = (indices, C, N)
        ctx.mark_non_differentiable(indices)
        return output

    @staticmethod
    def backward(ctx, grad_out):
        idx, C, N = ctx.for_backwards
        B, npoint = idx.size()

        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
        grad_out_data = grad_out.data.contiguous()
        gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint,
                                                     grad_out_data, idx,
                                                     grad_features.data)
        return grad_features, None


gather_points = GatherPoints.apply
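A hedged usage sketch (illustrative, not part of this commit): gather_points pairs naturally with furthest_point_sample to pull out the sampled feature columns, and its backward scatters gradients to the gathered positions:

import torch
from mmdet3d.ops import furthest_point_sample, gather_points

xyz = torch.rand(2, 1024, 3).cuda()
features = torch.rand(2, 64, 1024, device='cuda', requires_grad=True)  # (B, C, N)
idx = furthest_point_sample(xyz, 256)   # (B, 256)
sampled = gather_points(features, idx)  # (B, 64, 256), differentiable w.r.t. features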
mmdet3d/ops/gather_points/src/gather_points.cpp (new file)

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>

extern THCState *state;

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor points_tensor, at::Tensor idx_tensor,
                          at::Tensor out_tensor);

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const float *points, const int *idx,
                                   float *out, cudaStream_t stream);

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor grad_out_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor grad_points_tensor);

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const float *grad_out, const int *idx,
                                        float *grad_points,
                                        cudaStream_t stream);

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor points_tensor, at::Tensor idx_tensor,
                          at::Tensor out_tensor) {
  const float *points = points_tensor.data<float>();
  const int *idx = idx_tensor.data<int>();
  float *out = out_tensor.data<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream);
  return 1;
}

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor grad_out_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor grad_points_tensor) {
  const float *grad_out = grad_out_tensor.data<float>();
  const int *idx = idx_tensor.data<int>();
  float *grad_points = grad_points_tensor.data<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx,
                                     grad_points, stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("gather_points_wrapper", &gather_points_wrapper,
        "gather_points_wrapper");
  m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
        "gather_points_grad_wrapper");
}
mmdet3d/ops/gather_points/src/gather_points_cuda.cu (new file)

#include <stdio.h>
#include <stdlib.h>

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void gather_points_kernel(int b, int c, int n, int m,
                                     const float *__restrict__ points,
                                     const int *__restrict__ idx,
                                     float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, M)
  // output:
  //      out: (B, C, M)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  points += bs_idx * c * n + c_idx * n;
  out[0] = points[idx[0]];
}

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const float *points, const int *idx,
                                   float *out, cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  //      out: (B, C, npoints)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints,
                                                       points, idx, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
                                          const float *__restrict__ grad_out,
                                          const int *__restrict__ idx,
                                          float *__restrict__ grad_points) {
  // grad_out: (B, C, M)
  // idx: (B, M)
  // output:
  //      grad_points: (B, C, N)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  grad_points += bs_idx * c * n + c_idx * n;

  atomicAdd(grad_points + idx[0], grad_out[0]);
}

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const float *grad_out, const int *idx,
                                        float *grad_points,
                                        cudaStream_t stream) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  //      grad_points: (B, C, N)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
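Since the forward pass is a plain index lookup, it can be sanity-checked against torch.gather; a hedged sketch (illustrative, not part of this commit):

import torch
from mmdet3d.ops import gather_points

feats = torch.rand(2, 8, 100).cuda()               # (B, C, N)
idx = torch.randint(0, 100, (2, 16)).int().cuda()  # (B, M)
out = gather_points(feats, idx)                    # (B, C, M)
ref = feats.gather(2, idx.long().unsqueeze(1).expand(-1, 8, -1))
assert torch.allclose(out, ref)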
mmdet3d/ops/group_points/__init__.py (new file)

from .group_points import GroupAll, QueryAndGroup, grouping_operation

__all__ = ['QueryAndGroup', 'GroupAll', 'grouping_operation']
mmdet3d/ops/group_points/group_points.py (new file)

from typing import Tuple

import torch
import torch.nn as nn
from torch.autograd import Function

from ..ball_query import ball_query
from . import group_points_ext


class QueryAndGroup(nn.Module):
    """Query and Group.

    Groups points with a ball query of a given radius.

    Args:
        radius (float): radius of the balls.
        sample_num (int): Maximum number of features to gather in the ball.
        use_xyz (bool): Whether to use xyz. Default: True.
        return_grouped_xyz (bool): Whether to return grouped xyz.
            Default: False.
        normalize_xyz (bool): Whether to normalize xyz. Default: False.
        uniform_sample (bool): Whether to sample uniformly. Default: False.
        return_unique_cnt (bool): Whether to return the count of
            unique samples. Default: False.
    """

    def __init__(self,
                 radius,
                 sample_num,
                 use_xyz=True,
                 return_grouped_xyz=False,
                 normalize_xyz=False,
                 uniform_sample=False,
                 return_unique_cnt=False):
        super(QueryAndGroup, self).__init__()
        self.radius = radius
        self.sample_num = sample_num
        self.use_xyz = use_xyz
        self.return_grouped_xyz = return_grouped_xyz
        self.normalize_xyz = normalize_xyz
        self.uniform_sample = uniform_sample
        self.return_unique_cnt = return_unique_cnt
        if self.return_unique_cnt:
            assert self.uniform_sample

    def forward(self, points_xyz, center_xyz, features=None):
        """forward.

        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) centroids.
            features (Tensor): (B, C, N) descriptors of the features.

        Return:
            Tensor: (B, 3 + C, npoint, sample_num) Grouped feature.
        """
        idx = ball_query(self.radius, self.sample_num, points_xyz, center_xyz)

        if self.uniform_sample:
            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
            for i_batch in range(idx.shape[0]):
                for i_region in range(idx.shape[1]):
                    unique_ind = torch.unique(idx[i_batch, i_region, :])
                    num_unique = unique_ind.shape[0]
                    unique_cnt[i_batch, i_region] = num_unique
                    sample_ind = torch.randint(
                        0,
                        num_unique, (self.sample_num - num_unique, ),
                        dtype=torch.long)
                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
                    idx[i_batch, i_region, :] = all_ind

        xyz_trans = points_xyz.transpose(1, 2).contiguous()
        # (B, 3, npoint, sample_num)
        grouped_xyz = grouping_operation(xyz_trans, idx)
        grouped_xyz -= center_xyz.transpose(1, 2).unsqueeze(-1)
        if self.normalize_xyz:
            grouped_xyz /= self.radius

        if features is not None:
            grouped_features = grouping_operation(features, idx)
            if self.use_xyz:
                # (B, C + 3, npoint, sample_num)
                new_features = torch.cat([grouped_xyz, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            assert (self.use_xyz
                    ), 'use_xyz should be True when features is None'
            new_features = grouped_xyz

        ret = [new_features]
        if self.return_grouped_xyz:
            ret.append(grouped_xyz)
        if self.return_unique_cnt:
            ret.append(unique_cnt)
        if len(ret) == 1:
            return ret[0]
        else:
            return tuple(ret)


class GroupAll(nn.Module):
    """Group All.

    Group xyz with feature.

    Args:
        use_xyz (bool): Whether to use xyz.
    """

    def __init__(self, use_xyz: bool = True):
        super().__init__()
        self.use_xyz = use_xyz

    def forward(self,
                xyz: torch.Tensor,
                new_xyz: torch.Tensor,
                features: torch.Tensor = None):
        """forward.

        Args:
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            new_xyz (Tensor): Ignored.
            features (Tensor): (B, C, N) features to group.

        Return:
            Tensor: (B, C + 3, 1, N) Grouped feature.
        """
        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
        if features is not None:
            grouped_features = features.unsqueeze(2)
            if self.use_xyz:
                # (B, 3 + C, 1, N)
                new_features = torch.cat([grouped_xyz, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            new_features = grouped_xyz

        return new_features


class GroupingOperation(Function):
    """Grouping Operation.

    Group feature with given index.
    """

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            features (Tensor): (B, C, N) tensor of features to group.
            indices (Tensor): (B, npoint, nsample) the indices of
                features to group with.

        Returns:
            Tensor: (B, C, npoint, nsample) Grouped features.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, nfeatures, nsample = indices.size()
        _, C, N = features.size()
        output = torch.cuda.FloatTensor(B, C, nfeatures, nsample)

        group_points_ext.forward(B, C, N, nfeatures, nsample, features,
                                 indices, output)

        ctx.for_backwards = (indices, N)
        return output

    @staticmethod
    def backward(ctx,
                 grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """backward.

        Args:
            grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients
                of the output from forward.

        Returns:
            Tensor: (B, C, N) gradient of the features.
        """
        idx, N = ctx.for_backwards

        B, C, npoint, nsample = grad_out.size()
        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()

        grad_out_data = grad_out.data.contiguous()
        group_points_ext.backward(B, C, N, npoint, nsample, grad_out_data,
                                  idx, grad_features.data)
        return grad_features, None


grouping_operation = GroupingOperation.apply
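A hedged usage sketch (illustrative, not part of this commit) of the typical PointNet++ set-abstraction pattern these modules enable:

import torch
from mmdet3d.ops.group_points import QueryAndGroup

xyz = torch.rand(2, 1024, 3).cuda()        # (B, N, 3)
features = torch.rand(2, 32, 1024).cuda()  # (B, C, N)
centers = xyz[:, :256, :].contiguous()     # in practice chosen via furthest_point_sample
grouper = QueryAndGroup(radius=0.4, sample_num=32, use_xyz=True)
grouped = grouper(xyz, centers, features)  # (B, 3 + 32, 256, 32)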
mmdet3d/ops/group_points/src/group_points.cpp (new file)

#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>

extern THCState *state;

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor);

void group_points_kernel_launcher(int b, int c, int n, int npoints,
                                  int nsample, const float *points,
                                  const int *idx, float *out,
                                  cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor,
                              at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor);

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor,
                              at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor) {
  float *grad_points = grad_points_tensor.data<float>();
  const int *idx = idx_tensor.data<int>();
  const float *grad_out = grad_out_tensor.data<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
                                    grad_points, stream);
  return 1;
}

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor) {
  const float *points = points_tensor.data<float>();
  const int *idx = idx_tensor.data<int>();
  float *out = out_tensor.data<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
                               stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &group_points_wrapper, "group_points_wrapper");
  m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
}
mmdet3d/ops/group_points/src/group_points_cuda.cu (new file)

#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
                                         int nsample,
                                         const float *__restrict__ grad_out,
                                         const int *__restrict__ idx,
                                         float *__restrict__ grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;
  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
              pt_idx * nsample + sample_idx;
  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;

  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, nsample, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void group_points_kernel(int b, int c, int n, int npoints,
                                    int nsample,
                                    const float *__restrict__ points,
                                    const int *__restrict__ idx,
                                    float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;
  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
                pt_idx * nsample + sample_idx;

  out[out_idx] = points[in_idx];
}

void group_points_kernel_launcher(int b, int c, int n, int npoints,
                                  int nsample, const float *points,
                                  const int *idx, float *out,
                                  cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, nsample, points, idx, out);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
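grouping_operation is likewise a pure index lookup in the forward pass, so it can be checked against torch.gather; a hedged sketch (illustrative, not part of this commit):

import torch
from mmdet3d.ops import grouping_operation

feats = torch.rand(2, 8, 100).cuda()                  # (B, C, N)
idx = torch.randint(0, 100, (2, 16, 4)).int().cuda()  # (B, npoint, nsample)
out = grouping_operation(feats, idx)                  # (B, C, 16, 4)
B, C, _ = feats.shape
ref = feats.gather(
    2, idx.long().view(B, 1, -1).expand(-1, C, -1)).view(B, C, 16, 4)
assert torch.allclose(out, ref)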
mmdet3d/ops/interpolate/__init__.py (new file)

from .three_interpolate import three_interpolate
from .three_nn import three_nn

__all__ = ['three_nn', 'three_interpolate']
mmdet3d/ops/interpolate/src/interpolate.cpp (new file)

#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>

extern THCState *state;

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor);

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream);

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor);

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream);

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor);

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx,
                                            const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream);

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor) {
  const float *unknown = unknown_tensor.data<float>();
  const float *known = known_tensor.data<float>();
  float *dist2 = dist2_tensor.data<float>();
  int *idx = idx_tensor.data<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor) {
  const float *points = points_tensor.data<float>();
  const float *weight = weight_tensor.data<float>();
  float *out = out_tensor.data<float>();
  const int *idx = idx_tensor.data<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
                                    stream);
}

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor) {
  const float *grad_out = grad_out_tensor.data<float>();
  const float *weight = weight_tensor.data<float>();
  float *grad_points = grad_points_tensor.data<float>();
  const int *idx = idx_tensor.data<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
                                         grad_points, stream);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
        "three_interpolate_wrapper");
  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
        "three_interpolate_grad_wrapper");
}
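The three_nn / three_interpolate pair implements inverse-distance-weighted interpolation from the three nearest known points (the feature-propagation step of PointNet++). A hedged pure-PyTorch reference of the combined computation (not part of this commit; three_interpolate_reference is a hypothetical name):

import torch

def three_interpolate_reference(unknown, known, known_feats):
    """unknown: (B, n, 3); known: (B, m, 3); known_feats: (B, C, m) -> (B, C, n)."""
    d2 = torch.cdist(unknown, known) ** 2               # (B, n, m) squared distances
    dist2, idx = d2.topk(3, dim=-1, largest=False)      # three nearest known points
    weight = 1.0 / (dist2 + 1e-8)                       # inverse-distance weights
    weight = weight / weight.sum(dim=-1, keepdim=True)  # (B, n, 3), normalized
    B, C, m = known_feats.shape
    n = unknown.shape[1]
    gathered = known_feats.gather(
        2, idx.view(B, 1, n * 3).expand(-1, C, -1)).view(B, C, n, 3)
    return (gathered * weight.unsqueeze(1)).sum(-1)     # (B, C, n)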