Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmdetection3d
Commits
006b7ccf
Commit
006b7ccf
authored
May 18, 2020
by
wuyuefeng
Committed by
zhangwenwei
May 18, 2020
Browse files
Feat pointnet ops
parent
28e511cd
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1037 additions
and
3 deletions
+1037
-3
mmdet3d/models/roi_heads/__init__.py
mmdet3d/models/roi_heads/__init__.py
+2
-2
mmdet3d/ops/__init__.py
mmdet3d/ops/__init__.py
+8
-1
mmdet3d/ops/ball_query/__init__.py
mmdet3d/ops/ball_query/__init__.py
+3
-0
mmdet3d/ops/ball_query/ball_query.py
mmdet3d/ops/ball_query/ball_query.py
+45
-0
mmdet3d/ops/ball_query/src/ball_query.cpp
mmdet3d/ops/ball_query/src/ball_query.cpp
+37
-0
mmdet3d/ops/ball_query/src/ball_query_cuda.cu
mmdet3d/ops/ball_query/src/ball_query_cuda.cu
+67
-0
mmdet3d/ops/furthest_point_sample/__init__.py
mmdet3d/ops/furthest_point_sample/__init__.py
+3
-0
mmdet3d/ops/furthest_point_sample/furthest_point_sample.py
mmdet3d/ops/furthest_point_sample/furthest_point_sample.py
+42
-0
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample.cpp
...d/ops/furthest_point_sample/src/furthest_point_sample.cpp
+30
-0
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample_cuda.cu
...s/furthest_point_sample/src/furthest_point_sample_cuda.cu
+181
-0
mmdet3d/ops/gather_points/__init__.py
mmdet3d/ops/gather_points/__init__.py
+3
-0
mmdet3d/ops/gather_points/gather_points.py
mmdet3d/ops/gather_points/gather_points.py
+52
-0
mmdet3d/ops/gather_points/src/gather_points.cpp
mmdet3d/ops/gather_points/src/gather_points.cpp
+49
-0
mmdet3d/ops/gather_points/src/gather_points_cuda.cu
mmdet3d/ops/gather_points/src/gather_points_cuda.cu
+84
-0
mmdet3d/ops/group_points/__init__.py
mmdet3d/ops/group_points/__init__.py
+3
-0
mmdet3d/ops/group_points/group_points.py
mmdet3d/ops/group_points/group_points.py
+205
-0
mmdet3d/ops/group_points/src/group_points.cpp
mmdet3d/ops/group_points/src/group_points.cpp
+53
-0
mmdet3d/ops/group_points/src/group_points_cuda.cu
mmdet3d/ops/group_points/src/group_points_cuda.cu
+85
-0
mmdet3d/ops/interpolate/__init__.py
mmdet3d/ops/interpolate/__init__.py
+4
-0
mmdet3d/ops/interpolate/src/interpolate.cpp
mmdet3d/ops/interpolate/src/interpolate.cpp
+81
-0
No files found.
mmdet3d/models/roi_heads/__init__.py
View file @
006b7ccf
...
...
@@ -2,9 +2,9 @@ from .base_3droi_head import Base3DRoIHead
from
.bbox_heads
import
PartA2BboxHead
from
.mask_heads
import
PointwiseSemanticHead
from
.part_aggregation_roi_head
import
PartAggregationROIHead
from
.roi_extractors
import
Single3DRoIAwareExtractor
from
.roi_extractors
import
Single3DRoIAwareExtractor
,
SingleRoIExtractor
__all__
=
[
'Base3DRoIHead'
,
'PartAggregationROIHead'
,
'PointwiseSemanticHead'
,
'Single3DRoIAwareExtractor'
,
'PartA2BboxHead'
'Single3DRoIAwareExtractor'
,
'PartA2BboxHead'
,
'SingleRoIExtractor'
]
mmdet3d/ops/__init__.py
View file @
006b7ccf
from
mmdet.ops
import
(
RoIAlign
,
SigmoidFocalLoss
,
get_compiler_version
,
get_compiling_cuda_version
,
nms
,
roi_align
,
sigmoid_focal_loss
)
from
.ball_query
import
ball_query
from
.furthest_point_sample
import
furthest_point_sample
from
.gather_points
import
gather_points
from
.group_points
import
group_points
,
grouping_operation
from
.interpolate
import
three_interpolate
,
three_nn
from
.norm
import
NaiveSyncBatchNorm1d
,
NaiveSyncBatchNorm2d
from
.roiaware_pool3d
import
(
RoIAwarePool3d
,
points_in_boxes_cpu
,
points_in_boxes_gpu
)
...
...
@@ -15,5 +20,7 @@ __all__ = [
'dynamic_scatter'
,
'DynamicScatter'
,
'sigmoid_focal_loss'
,
'SigmoidFocalLoss'
,
'SparseBasicBlock'
,
'SparseBottleneck'
,
'RoIAwarePool3d'
,
'points_in_boxes_gpu'
,
'points_in_boxes_cpu'
,
'make_sparse_convmodule'
'make_sparse_convmodule'
,
'ball_query'
,
'furthest_point_sample'
,
'three_interpolate'
,
'three_nn'
,
'gather_points'
,
'grouping_operation'
,
'group_points'
]
mmdet3d/ops/ball_query/__init__.py
0 → 100644
View file @
006b7ccf
from
.ball_query
import
ball_query
__all__
=
[
'ball_query'
]
mmdet3d/ops/ball_query/ball_query.py
0 → 100644
View file @
006b7ccf
import
torch
from
torch.autograd
import
Function
from
.
import
ball_query_ext
class
BallQuery
(
Function
):
"""Ball Query
Find nearby points in spherical space.
"""
@
staticmethod
def
forward
(
ctx
,
radius
:
float
,
sample_num
:
int
,
xyz
:
torch
.
Tensor
,
center_xyz
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""forward.
Args:
radius (float): radius of the balls.
sample_num (int): maximum number of features in the balls.
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) centers of the ball query.
Returns:
Tensor: (B, npoint, nsample) tensor with the indicies of
the features that form the query balls.
"""
assert
center_xyz
.
is_contiguous
()
assert
xyz
.
is_contiguous
()
B
,
N
,
_
=
xyz
.
size
()
npoint
=
center_xyz
.
size
(
1
)
idx
=
torch
.
cuda
.
IntTensor
(
B
,
npoint
,
sample_num
).
zero_
()
ball_query_ext
.
ball_query_wrapper
(
B
,
N
,
npoint
,
radius
,
sample_num
,
center_xyz
,
xyz
,
idx
)
ctx
.
mark_non_differentiable
(
idx
)
return
idx
@
staticmethod
def
backward
(
ctx
,
a
=
None
):
return
None
,
None
,
None
,
None
ball_query
=
BallQuery
.
apply
mmdet3d/ops/ball_query/src/ball_query.cpp
0 → 100644
View file @
006b7ccf
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
extern
THCState
*
state
;
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
int
ball_query_wrapper
(
int
b
,
int
n
,
int
m
,
float
radius
,
int
nsample
,
at
::
Tensor
new_xyz_tensor
,
at
::
Tensor
xyz_tensor
,
at
::
Tensor
idx_tensor
);
void
ball_query_kernel_launcher
(
int
b
,
int
n
,
int
m
,
float
radius
,
int
nsample
,
const
float
*
xyz
,
const
float
*
new_xyz
,
int
*
idx
,
cudaStream_t
stream
);
int
ball_query_wrapper
(
int
b
,
int
n
,
int
m
,
float
radius
,
int
nsample
,
at
::
Tensor
new_xyz_tensor
,
at
::
Tensor
xyz_tensor
,
at
::
Tensor
idx_tensor
)
{
CHECK_INPUT
(
new_xyz_tensor
);
CHECK_INPUT
(
xyz_tensor
);
const
float
*
new_xyz
=
new_xyz_tensor
.
data
<
float
>
();
const
float
*
xyz
=
xyz_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
ball_query_kernel_launcher
(
b
,
n
,
m
,
radius
,
nsample
,
new_xyz
,
xyz
,
idx
,
stream
);
return
1
;
}
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
m
.
def
(
"ball_query_wrapper"
,
&
ball_query_wrapper
,
"ball_query_wrapper"
);
}
mmdet3d/ops/ball_query/src/ball_query_cuda.cu
0 → 100644
View file @
006b7ccf
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
__global__
void
ball_query_kernel
(
int
b
,
int
n
,
int
m
,
float
radius
,
int
nsample
,
const
float
*
__restrict__
new_xyz
,
const
float
*
__restrict__
xyz
,
int
*
__restrict__
idx
)
{
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
// idx: (B, M, nsample)
int
bs_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
pt_idx
>=
m
)
return
;
new_xyz
+=
bs_idx
*
m
*
3
+
pt_idx
*
3
;
xyz
+=
bs_idx
*
n
*
3
;
idx
+=
bs_idx
*
m
*
nsample
+
pt_idx
*
nsample
;
float
radius2
=
radius
*
radius
;
float
new_x
=
new_xyz
[
0
];
float
new_y
=
new_xyz
[
1
];
float
new_z
=
new_xyz
[
2
];
int
cnt
=
0
;
for
(
int
k
=
0
;
k
<
n
;
++
k
)
{
float
x
=
xyz
[
k
*
3
+
0
];
float
y
=
xyz
[
k
*
3
+
1
];
float
z
=
xyz
[
k
*
3
+
2
];
float
d2
=
(
new_x
-
x
)
*
(
new_x
-
x
)
+
(
new_y
-
y
)
*
(
new_y
-
y
)
+
(
new_z
-
z
)
*
(
new_z
-
z
);
if
(
d2
<
radius2
){
if
(
cnt
==
0
){
for
(
int
l
=
0
;
l
<
nsample
;
++
l
)
{
idx
[
l
]
=
k
;
}
}
idx
[
cnt
]
=
k
;
++
cnt
;
if
(
cnt
>=
nsample
)
break
;
}
}
}
void
ball_query_kernel_launcher
(
int
b
,
int
n
,
int
m
,
float
radius
,
int
nsample
,
\
const
float
*
new_xyz
,
const
float
*
xyz
,
int
*
idx
,
cudaStream_t
stream
)
{
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
// idx: (B, M, nsample)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
m
,
THREADS_PER_BLOCK
),
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
ball_query_kernel
<<<
blocks
,
threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
radius
,
nsample
,
new_xyz
,
xyz
,
idx
);
// cudaDeviceSynchronize(); // for using printf in kernel function
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
mmdet3d/ops/furthest_point_sample/__init__.py
0 → 100644
View file @
006b7ccf
from
.furthest_point_sample
import
furthest_point_sample
__all__
=
[
'furthest_point_sample'
]
mmdet3d/ops/furthest_point_sample/furthest_point_sample.py
0 → 100644
View file @
006b7ccf
import
torch
from
torch.autograd
import
Function
from
.
import
furthest_point_sample_ext
class
FurthestPointSampling
(
Function
):
"""Furthest Point Sampling.
Uses iterative furthest point sampling to select a set of
features whose corresponding points have the furthest distance.
"""
@
staticmethod
def
forward
(
ctx
,
points_xyz
:
torch
.
Tensor
,
num_points
:
int
)
->
torch
.
Tensor
:
"""forward.
Args:
points_xyz (Tensor): (B, N, 3) where N > num_points.
num_points (int): Number of points in the sampled set.
Returns:
Tensor: (B, num_points) indices of the sampled points.
"""
assert
points_xyz
.
is_contiguous
()
B
,
N
,
_
=
points_xyz
.
size
()
output
=
torch
.
cuda
.
IntTensor
(
B
,
num_points
)
temp
=
torch
.
cuda
.
FloatTensor
(
B
,
N
).
fill_
(
1e10
)
furthest_point_sample_ext
.
furthest_point_sampling_wrapper
(
B
,
N
,
num_points
,
points_xyz
,
temp
,
output
)
ctx
.
mark_non_differentiable
(
output
)
return
output
@
staticmethod
def
backward
(
xyz
,
a
=
None
):
return
None
,
None
furthest_point_sample
=
FurthestPointSampling
.
apply
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample.cpp
0 → 100644
View file @
006b7ccf
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
extern
THCState
*
state
;
int
furthest_point_sampling_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
points_tensor
,
at
::
Tensor
temp_tensor
,
at
::
Tensor
idx_tensor
);
void
furthest_point_sampling_kernel_launcher
(
int
b
,
int
n
,
int
m
,
const
float
*
dataset
,
float
*
temp
,
int
*
idxs
,
cudaStream_t
stream
);
int
furthest_point_sampling_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
points_tensor
,
at
::
Tensor
temp_tensor
,
at
::
Tensor
idx_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
float
*
temp
=
temp_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
furthest_point_sampling_kernel_launcher
(
b
,
n
,
m
,
points
,
temp
,
idx
,
stream
);
return
1
;
}
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
m
.
def
(
"furthest_point_sampling_wrapper"
,
&
furthest_point_sampling_wrapper
,
"furthest_point_sampling_wrapper"
);
}
mmdet3d/ops/furthest_point_sample/src/furthest_point_sample_cuda.cu
0 → 100644
View file @
006b7ccf
#include <stdio.h>
#include <stdlib.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
inline
int
opt_n_threads
(
int
work_size
)
{
const
int
pow_2
=
std
::
log
(
static_cast
<
double
>
(
work_size
))
/
std
::
log
(
2.0
);
return
max
(
min
(
1
<<
pow_2
,
TOTAL_THREADS
),
1
);
}
__device__
void
__update
(
float
*
__restrict__
dists
,
int
*
__restrict__
dists_i
,
int
idx1
,
int
idx2
){
const
float
v1
=
dists
[
idx1
],
v2
=
dists
[
idx2
];
const
int
i1
=
dists_i
[
idx1
],
i2
=
dists_i
[
idx2
];
dists
[
idx1
]
=
max
(
v1
,
v2
);
dists_i
[
idx1
]
=
v2
>
v1
?
i2
:
i1
;
}
template
<
unsigned
int
block_size
>
__global__
void
furthest_point_sampling_kernel
(
int
b
,
int
n
,
int
m
,
const
float
*
__restrict__
dataset
,
float
*
__restrict__
temp
,
int
*
__restrict__
idxs
)
{
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
if
(
m
<=
0
)
return
;
__shared__
float
dists
[
block_size
];
__shared__
int
dists_i
[
block_size
];
int
batch_index
=
blockIdx
.
x
;
dataset
+=
batch_index
*
n
*
3
;
temp
+=
batch_index
*
n
;
idxs
+=
batch_index
*
m
;
int
tid
=
threadIdx
.
x
;
const
int
stride
=
block_size
;
int
old
=
0
;
if
(
threadIdx
.
x
==
0
)
idxs
[
0
]
=
old
;
__syncthreads
();
for
(
int
j
=
1
;
j
<
m
;
j
++
)
{
int
besti
=
0
;
float
best
=
-
1
;
float
x1
=
dataset
[
old
*
3
+
0
];
float
y1
=
dataset
[
old
*
3
+
1
];
float
z1
=
dataset
[
old
*
3
+
2
];
for
(
int
k
=
tid
;
k
<
n
;
k
+=
stride
)
{
float
x2
,
y2
,
z2
;
x2
=
dataset
[
k
*
3
+
0
];
y2
=
dataset
[
k
*
3
+
1
];
z2
=
dataset
[
k
*
3
+
2
];
// float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
// if (mag <= 1e-3)
// continue;
float
d
=
(
x2
-
x1
)
*
(
x2
-
x1
)
+
(
y2
-
y1
)
*
(
y2
-
y1
)
+
(
z2
-
z1
)
*
(
z2
-
z1
);
float
d2
=
min
(
d
,
temp
[
k
]);
temp
[
k
]
=
d2
;
besti
=
d2
>
best
?
k
:
besti
;
best
=
d2
>
best
?
d2
:
best
;
}
dists
[
tid
]
=
best
;
dists_i
[
tid
]
=
besti
;
__syncthreads
();
if
(
block_size
>=
1024
)
{
if
(
tid
<
512
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
512
);
}
__syncthreads
();
}
if
(
block_size
>=
512
)
{
if
(
tid
<
256
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
256
);
}
__syncthreads
();
}
if
(
block_size
>=
256
)
{
if
(
tid
<
128
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
128
);
}
__syncthreads
();
}
if
(
block_size
>=
128
)
{
if
(
tid
<
64
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
64
);
}
__syncthreads
();
}
if
(
block_size
>=
64
)
{
if
(
tid
<
32
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
32
);
}
__syncthreads
();
}
if
(
block_size
>=
32
)
{
if
(
tid
<
16
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
16
);
}
__syncthreads
();
}
if
(
block_size
>=
16
)
{
if
(
tid
<
8
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
8
);
}
__syncthreads
();
}
if
(
block_size
>=
8
)
{
if
(
tid
<
4
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
4
);
}
__syncthreads
();
}
if
(
block_size
>=
4
)
{
if
(
tid
<
2
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
2
);
}
__syncthreads
();
}
if
(
block_size
>=
2
)
{
if
(
tid
<
1
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
1
);
}
__syncthreads
();
}
old
=
dists_i
[
0
];
if
(
tid
==
0
)
idxs
[
j
]
=
old
;
}
}
void
furthest_point_sampling_kernel_launcher
(
int
b
,
int
n
,
int
m
,
const
float
*
dataset
,
float
*
temp
,
int
*
idxs
,
cudaStream_t
stream
)
{
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
cudaError_t
err
;
unsigned
int
n_threads
=
opt_n_threads
(
n
);
switch
(
n_threads
)
{
case
1024
:
furthest_point_sampling_kernel
<
1024
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
512
:
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
256
:
furthest_point_sampling_kernel
<
256
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
128
:
furthest_point_sampling_kernel
<
128
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
64
:
furthest_point_sampling_kernel
<
64
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
32
:
furthest_point_sampling_kernel
<
32
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
16
:
furthest_point_sampling_kernel
<
16
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
8
:
furthest_point_sampling_kernel
<
8
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
4
:
furthest_point_sampling_kernel
<
4
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
2
:
furthest_point_sampling_kernel
<
2
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
1
:
furthest_point_sampling_kernel
<
1
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
default:
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
}
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
mmdet3d/ops/gather_points/__init__.py
0 → 100644
View file @
006b7ccf
from
.gather_points
import
gather_points
__all__
=
[
'gather_points'
]
mmdet3d/ops/gather_points/gather_points.py
0 → 100644
View file @
006b7ccf
import
torch
from
torch.autograd
import
Function
from
.
import
gather_points_ext
class
GatherPoints
(
Function
):
"""Gather Points
Gather points with given index.
"""
@
staticmethod
def
forward
(
ctx
,
features
:
torch
.
Tensor
,
indicies
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""forward.
Args:
features (Tensor): (B, C, N) features to gather.
indicies (Tensor): (B, M) where M is the number of points.
Returns:
Tensor: (B, C, M) where M is the number of points.
"""
assert
features
.
is_contiguous
()
assert
indicies
.
is_contiguous
()
B
,
npoint
=
indicies
.
size
()
_
,
C
,
N
=
features
.
size
()
output
=
torch
.
cuda
.
FloatTensor
(
B
,
C
,
npoint
)
gather_points_ext
.
gather_points_wrapper
(
B
,
C
,
N
,
npoint
,
features
,
indicies
,
output
)
ctx
.
for_backwards
=
(
indicies
,
C
,
N
)
ctx
.
mark_non_differentiable
(
indicies
)
return
output
@
staticmethod
def
backward
(
ctx
,
grad_out
):
idx
,
C
,
N
=
ctx
.
for_backwards
B
,
npoint
=
idx
.
size
()
grad_features
=
torch
.
cuda
.
FloatTensor
(
B
,
C
,
N
).
zero_
()
grad_out_data
=
grad_out
.
data
.
contiguous
()
gather_points_ext
.
gather_points_grad_wrapper
(
B
,
C
,
N
,
npoint
,
grad_out_data
,
idx
,
grad_features
.
data
)
return
grad_features
,
None
gather_points
=
GatherPoints
.
apply
mmdet3d/ops/gather_points/src/gather_points.cpp
0 → 100644
View file @
006b7ccf
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
extern
THCState
*
state
;
int
gather_points_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
);
void
gather_points_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
,
cudaStream_t
stream
);
int
gather_points_grad_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
);
void
gather_points_grad_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
,
cudaStream_t
stream
);
int
gather_points_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
){
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
gather_points_kernel_launcher
(
b
,
c
,
n
,
npoints
,
points
,
idx
,
out
,
stream
);
return
1
;
}
int
gather_points_grad_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
)
{
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
gather_points_grad_kernel_launcher
(
b
,
c
,
n
,
npoints
,
grad_out
,
idx
,
grad_points
,
stream
);
return
1
;
}
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
m
.
def
(
"gather_points_wrapper"
,
&
gather_points_wrapper
,
"gather_points_wrapper"
);
m
.
def
(
"gather_points_grad_wrapper"
,
&
gather_points_grad_wrapper
,
"gather_points_grad_wrapper"
);
}
mmdet3d/ops/gather_points/src/gather_points_cuda.cu
0 → 100644
View file @
006b7ccf
#include <stdio.h>
#include <stdlib.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
__global__
void
gather_points_kernel
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
__restrict__
points
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
out
)
{
// points: (B, C, N)
// idx: (B, M)
// output:
// out: (B, C, M)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
m
)
return
;
out
+=
bs_idx
*
c
*
m
+
c_idx
*
m
+
pt_idx
;
idx
+=
bs_idx
*
m
+
pt_idx
;
points
+=
bs_idx
*
c
*
n
+
c_idx
*
n
;
out
[
0
]
=
points
[
idx
[
0
]];
}
void
gather_points_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
,
cudaStream_t
stream
)
{
// points: (B, C, N)
// idx: (B, npoints)
// output:
// out: (B, C, npoints)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
gather_points_kernel
<<<
blocks
,
threads
,
0
,
stream
>>>
(
b
,
c
,
n
,
npoints
,
points
,
idx
,
out
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
gather_points_grad_kernel
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
__restrict__
grad_out
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
grad_points
)
{
// grad_out: (B, C, M)
// idx: (B, M)
// output:
// grad_points: (B, C, N)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
m
)
return
;
grad_out
+=
bs_idx
*
c
*
m
+
c_idx
*
m
+
pt_idx
;
idx
+=
bs_idx
*
m
+
pt_idx
;
grad_points
+=
bs_idx
*
c
*
n
+
c_idx
*
n
;
atomicAdd
(
grad_points
+
idx
[
0
],
grad_out
[
0
]);
}
void
gather_points_grad_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
,
cudaStream_t
stream
)
{
// grad_out: (B, C, npoints)
// idx: (B, npoints)
// output:
// grad_points: (B, C, N)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
gather_points_grad_kernel
<<<
blocks
,
threads
,
0
,
stream
>>>
(
b
,
c
,
n
,
npoints
,
grad_out
,
idx
,
grad_points
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
mmdet3d/ops/group_points/__init__.py
0 → 100644
View file @
006b7ccf
from
.group_points
import
GroupAll
,
QueryAndGroup
,
grouping_operation
__all__
=
[
'QueryAndGroup'
,
'GroupAll'
,
'grouping_operation'
]
mmdet3d/ops/group_points/group_points.py
0 → 100644
View file @
006b7ccf
from
typing
import
Tuple
import
torch
import
torch.nn
as
nn
from
torch.autograd
import
Function
from
..ball_query
import
ball_query
from
.
import
group_points_ext
class
QueryAndGroup
(
nn
.
Module
):
"""Query and Group.
Groups with a ball query of radius
Args:
radius (float): radius of the balls.
sample_num (int): Maximum number of features to gather in the ball.
use_xyz (bool): Whether to use xyz.
Default: True.
return_grouped_xyz (bool): Whether to return grouped xyz.
Default: False.
normalize_xyz (bool): Whether to normalize xyz.
Default: False.
uniform_sample (bool): Whether to sample uniformly.
Default: False
return_unique_cnt (bool): Whether to return the count of
unique samples.
Default: False.
"""
def
__init__
(
self
,
radius
,
sample_num
,
use_xyz
=
True
,
return_grouped_xyz
=
False
,
normalize_xyz
=
False
,
uniform_sample
=
False
,
return_unique_cnt
=
False
):
super
(
QueryAndGroup
,
self
).
__init__
()
self
.
radius
=
radius
self
.
sample_num
=
sample_num
self
.
use_xyz
=
use_xyz
self
.
return_grouped_xyz
=
return_grouped_xyz
self
.
normalize_xyz
=
normalize_xyz
self
.
uniform_sample
=
uniform_sample
self
.
return_unique_cnt
=
return_unique_cnt
if
self
.
return_unique_cnt
:
assert
self
.
uniform_sample
def
forward
(
self
,
points_xyz
,
center_xyz
,
features
=
None
):
"""forward
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) Centriods.
features (Tensor): (B, C, N) Descriptors of the features.
Return:
Tensor: (B, 3 + C, npoint, sample_num) Grouped feature.
"""
idx
=
ball_query
(
self
.
radius
,
self
.
sample_num
,
points_xyz
,
center_xyz
)
if
self
.
uniform_sample
:
unique_cnt
=
torch
.
zeros
((
idx
.
shape
[
0
],
idx
.
shape
[
1
]))
for
i_batch
in
range
(
idx
.
shape
[
0
]):
for
i_region
in
range
(
idx
.
shape
[
1
]):
unique_ind
=
torch
.
unique
(
idx
[
i_batch
,
i_region
,
:])
num_unique
=
unique_ind
.
shape
[
0
]
unique_cnt
[
i_batch
,
i_region
]
=
num_unique
sample_ind
=
torch
.
randint
(
0
,
num_unique
,
(
self
.
sample_num
-
num_unique
,
),
dtype
=
torch
.
long
)
all_ind
=
torch
.
cat
((
unique_ind
,
unique_ind
[
sample_ind
]))
idx
[
i_batch
,
i_region
,
:]
=
all_ind
xyz_trans
=
points_xyz
.
transpose
(
1
,
2
).
contiguous
()
# (B, 3, npoint, sample_num)
grouped_xyz
=
grouping_operation
(
xyz_trans
,
idx
)
grouped_xyz
-=
center_xyz
.
transpose
(
1
,
2
).
unsqueeze
(
-
1
)
if
self
.
normalize_xyz
:
grouped_xyz
/=
self
.
radius
if
features
is
not
None
:
grouped_features
=
grouping_operation
(
features
,
idx
)
if
self
.
use_xyz
:
# (B, C + 3, npoint, sample_num)
new_features
=
torch
.
cat
([
grouped_xyz
,
grouped_features
],
dim
=
1
)
else
:
new_features
=
grouped_features
else
:
assert
(
self
.
use_xyz
),
'Cannot have not features and not use xyz as a feature!'
new_features
=
grouped_xyz
ret
=
[
new_features
]
if
self
.
return_grouped_xyz
:
ret
.
append
(
grouped_xyz
)
if
self
.
return_unique_cnt
:
ret
.
append
(
unique_cnt
)
if
len
(
ret
)
==
1
:
return
ret
[
0
]
else
:
return
tuple
(
ret
)
class
GroupAll
(
nn
.
Module
):
"""Group All.
Group xyz with feature.
Args:
use_xyz (bool): Whether to use xyz.
"""
def
__init__
(
self
,
use_xyz
:
bool
=
True
):
super
().
__init__
()
self
.
use_xyz
=
use_xyz
def
forward
(
self
,
xyz
:
torch
.
Tensor
,
new_xyz
:
torch
.
Tensor
,
features
:
torch
.
Tensor
=
None
):
"""forward.
Args:
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
new_xyz (Tensor): Ignored.
features (Tensor): (B, C, N) features to group.
Return:
Tensor: (B, C + 3, 1, N) Grouped feature.
"""
grouped_xyz
=
xyz
.
transpose
(
1
,
2
).
unsqueeze
(
2
)
if
features
is
not
None
:
grouped_features
=
features
.
unsqueeze
(
2
)
if
self
.
use_xyz
:
new_features
=
torch
.
cat
([
grouped_xyz
,
grouped_features
],
dim
=
1
)
# (B, 3 + C, 1, N)
else
:
new_features
=
grouped_features
else
:
new_features
=
grouped_xyz
return
new_features
class
GroupingOperation
(
Function
):
"""Grouping Operation.
Group feature with given index.
"""
@
staticmethod
def
forward
(
ctx
,
features
:
torch
.
Tensor
,
indices
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""forward.
Args:
features (Tensor): (B, C, N) tensor of features to group.
indices (Tensor): (B, npoint, nsample) the indicies of
features to group with.
Returns:
Tensor: (B, C, npoint, nsample) Grouped features.
"""
assert
features
.
is_contiguous
()
assert
indices
.
is_contiguous
()
B
,
nfeatures
,
nsample
=
indices
.
size
()
_
,
C
,
N
=
features
.
size
()
output
=
torch
.
cuda
.
FloatTensor
(
B
,
C
,
nfeatures
,
nsample
)
group_points_ext
.
forward
(
B
,
C
,
N
,
nfeatures
,
nsample
,
features
,
indices
,
output
)
ctx
.
for_backwards
=
(
indices
,
N
)
return
output
@
staticmethod
def
backward
(
ctx
,
grad_out
:
torch
.
Tensor
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""backward.
Args:
grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients
of the output from forward.
Returns:
Tensor: (B, C, N) gradient of the features.
"""
idx
,
N
=
ctx
.
for_backwards
B
,
C
,
npoint
,
nsample
=
grad_out
.
size
()
grad_features
=
torch
.
cuda
.
FloatTensor
(
B
,
C
,
N
).
zero_
()
grad_out_data
=
grad_out
.
data
.
contiguous
()
group_points_ext
.
backward
(
B
,
C
,
N
,
npoint
,
nsample
,
grad_out_data
,
idx
,
grad_features
.
data
)
return
grad_features
,
None
grouping_operation
=
GroupingOperation
.
apply
mmdet3d/ops/group_points/src/group_points.cpp
0 → 100644
View file @
006b7ccf
#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
extern
THCState
*
state
;
int
group_points_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
);
void
group_points_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
,
cudaStream_t
stream
);
int
group_points_grad_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
);
void
group_points_grad_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
,
cudaStream_t
stream
);
int
group_points_grad_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
)
{
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
group_points_grad_kernel_launcher
(
b
,
c
,
n
,
npoints
,
nsample
,
grad_out
,
idx
,
grad_points
,
stream
);
return
1
;
}
int
group_points_wrapper
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
group_points_kernel_launcher
(
b
,
c
,
n
,
npoints
,
nsample
,
points
,
idx
,
out
,
stream
);
return
1
;
}
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
m
.
def
(
"forward"
,
&
group_points_wrapper
,
"group_points_wrapper"
);
m
.
def
(
"backward"
,
&
group_points_grad_wrapper
,
"group_points_grad_wrapper"
);
}
mmdet3d/ops/group_points/src/group_points_cuda.cu
0 → 100644
View file @
006b7ccf
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
__global__
void
group_points_grad_kernel
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
__restrict__
grad_out
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
grad_points
)
{
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
pt_idx
=
index
/
nsample
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
npoints
)
return
;
int
sample_idx
=
index
%
nsample
;
grad_out
+=
bs_idx
*
c
*
npoints
*
nsample
+
c_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
idx
+=
bs_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
atomicAdd
(
grad_points
+
bs_idx
*
c
*
n
+
c_idx
*
n
+
idx
[
0
]
,
grad_out
[
0
]);
}
void
group_points_grad_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
,
cudaStream_t
stream
)
{
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
*
nsample
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
group_points_grad_kernel
<<<
blocks
,
threads
,
0
,
stream
>>>
(
b
,
c
,
n
,
npoints
,
nsample
,
grad_out
,
idx
,
grad_points
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
group_points_kernel
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
__restrict__
points
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
out
)
{
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
pt_idx
=
index
/
nsample
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
npoints
)
return
;
int
sample_idx
=
index
%
nsample
;
idx
+=
bs_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
int
in_idx
=
bs_idx
*
c
*
n
+
c_idx
*
n
+
idx
[
0
];
int
out_idx
=
bs_idx
*
c
*
npoints
*
nsample
+
c_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
out
[
out_idx
]
=
points
[
in_idx
];
}
void
group_points_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
,
cudaStream_t
stream
)
{
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
*
nsample
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
group_points_kernel
<<<
blocks
,
threads
,
0
,
stream
>>>
(
b
,
c
,
n
,
npoints
,
nsample
,
points
,
idx
,
out
);
// cudaDeviceSynchronize(); // for using printf in kernel function
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
mmdet3d/ops/interpolate/__init__.py
0 → 100644
View file @
006b7ccf
from
.three_interpolate
import
three_interpolate
from
.three_nn
import
three_nn
__all__
=
[
'three_nn'
,
'three_interpolate'
]
mmdet3d/ops/interpolate/src/interpolate.cpp
0 → 100644
View file @
006b7ccf
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
extern
THCState
*
state
;
void
three_nn_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
unknown_tensor
,
at
::
Tensor
known_tensor
,
at
::
Tensor
dist2_tensor
,
at
::
Tensor
idx_tensor
);
void
three_nn_kernel_launcher
(
int
b
,
int
n
,
int
m
,
const
float
*
unknown
,
const
float
*
known
,
float
*
dist2
,
int
*
idx
,
cudaStream_t
stream
);
void
three_interpolate_wrapper
(
int
b
,
int
c
,
int
m
,
int
n
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
out_tensor
);
void
three_interpolate_kernel_launcher
(
int
b
,
int
c
,
int
m
,
int
n
,
const
float
*
points
,
const
int
*
idx
,
const
float
*
weight
,
float
*
out
,
cudaStream_t
stream
);
void
three_interpolate_grad_wrapper
(
int
b
,
int
c
,
int
n
,
int
m
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
grad_points_tensor
);
void
three_interpolate_grad_kernel_launcher
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
grad_out
,
const
int
*
idx
,
const
float
*
weight
,
float
*
grad_points
,
cudaStream_t
stream
);
void
three_nn_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
unknown_tensor
,
at
::
Tensor
known_tensor
,
at
::
Tensor
dist2_tensor
,
at
::
Tensor
idx_tensor
)
{
const
float
*
unknown
=
unknown_tensor
.
data
<
float
>
();
const
float
*
known
=
known_tensor
.
data
<
float
>
();
float
*
dist2
=
dist2_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
three_nn_kernel_launcher
(
b
,
n
,
m
,
unknown
,
known
,
dist2
,
idx
,
stream
);
}
void
three_interpolate_wrapper
(
int
b
,
int
c
,
int
m
,
int
n
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
out_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
three_interpolate_kernel_launcher
(
b
,
c
,
m
,
n
,
points
,
idx
,
weight
,
out
,
stream
);
}
void
three_interpolate_grad_wrapper
(
int
b
,
int
c
,
int
n
,
int
m
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
grad_points_tensor
)
{
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
three_interpolate_grad_kernel_launcher
(
b
,
c
,
n
,
m
,
grad_out
,
idx
,
weight
,
grad_points
,
stream
);
}
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
m
.
def
(
"three_nn_wrapper"
,
&
three_nn_wrapper
,
"three_nn_wrapper"
);
m
.
def
(
"three_interpolate_wrapper"
,
&
three_interpolate_wrapper
,
"three_interpolate_wrapper"
);
m
.
def
(
"three_interpolate_grad_wrapper"
,
&
three_interpolate_grad_wrapper
,
"three_interpolate_grad_wrapper"
);
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment