OpenDAS / mmdetection3d · Commits · 333536f6

Unverified commit 333536f6, authored Apr 06, 2022 by Wenwei Zhang, committed by GitHub on Apr 06, 2022.

Release v1.0.0rc1

Parents: 9c7270d0, f747daab

The commit changes 219 files in total; this page shows 20 changed files, with 0 additions and 1856 deletions (+0, −1856). Every file below was deleted (file mode 100644 → 0) relative to parent 9c7270d0:

    mmdet3d/ops/gather_points/__init__.py                    +0  −4
    mmdet3d/ops/gather_points/gather_points.py               +0  −53
    mmdet3d/ops/gather_points/src/gather_points.cpp          +0  −56
    mmdet3d/ops/gather_points/src/gather_points_cuda.cu      +0  −124
    mmdet3d/ops/group_points/__init__.py                     +0  −4
    mmdet3d/ops/group_points/group_points.py                 +0  −236
    mmdet3d/ops/group_points/src/group_points.cpp            +0  −62
    mmdet3d/ops/group_points/src/group_points_cuda.cu        +0  −101
    mmdet3d/ops/interpolate/__init__.py                      +0  −5
    mmdet3d/ops/interpolate/src/interpolate.cpp              +0  −93
    mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu    +0  −108
    mmdet3d/ops/interpolate/src/three_nn_cuda.cu             +0  −89
    mmdet3d/ops/interpolate/three_interpolate.py             +0  −65
    mmdet3d/ops/interpolate/three_nn.py                      +0  −47
    mmdet3d/ops/iou3d/__init__.py                            +0  −4
    mmdet3d/ops/iou3d/iou3d_utils.py                         +0  −79
    mmdet3d/ops/iou3d/src/iou3d.cpp                          +0  −210
    mmdet3d/ops/iou3d/src/iou3d_kernel.cu                    +0  −439
    mmdet3d/ops/knn/__init__.py                              +0  −4
    mmdet3d/ops/knn/knn.py                                   +0  −73
mmdet3d/ops/gather_points/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .gather_points import gather_points

__all__ = ['gather_points']
```
mmdet3d/ops/gather_points/gather_points.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.autograd import Function

from . import gather_points_ext


class GatherPoints(Function):
    """Gather Points.

    Gather points with given index.
    """

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            features (Tensor): (B, C, N) features to gather.
            indices (Tensor): (B, M) where M is the number of points.

        Returns:
            Tensor: (B, C, M) where M is the number of points.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, npoint = indices.size()
        _, C, N = features.size()
        output = features.new_zeros((B, C, npoint))

        gather_points_ext.gather_points_wrapper(B, C, N, npoint, features,
                                                indices, output)

        ctx.for_backwards = (indices, C, N)
        ctx.mark_non_differentiable(indices)
        return output

    @staticmethod
    def backward(ctx, grad_out):
        idx, C, N = ctx.for_backwards
        B, npoint = idx.size()

        grad_features = grad_out.new_zeros((B, C, N))
        grad_out_data = grad_out.data.contiguous()
        gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint,
                                                     grad_out_data, idx,
                                                     grad_features.data)
        return grad_features, None


gather_points = GatherPoints.apply
```
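For context, a minimal usage sketch of the `gather_points` autograd function above. The tensors are made-up example data, and the import path is the pre-1.0.0rc1 layout that this commit removes:

```python
import torch
from mmdet3d.ops.gather_points import gather_points

# (B, C, N) feature map and (B, M) int32 indices; both must be CUDA
# tensors and contiguous, as the forward() asserts require.
features = torch.randn(2, 16, 1024, device='cuda', requires_grad=True)
indices = torch.randint(0, 1024, (2, 128), device='cuda', dtype=torch.int32)

gathered = gather_points(features, indices)  # (2, 16, 128)
gathered.sum().backward()                    # gradients flow back into `features`
print(gathered.shape, features.grad.shape)
```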
mmdet3d/ops/gather_points/src/gather_points.cpp (deleted)

```cpp
#include <ATen/cuda/CUDAContext.h>
#include <ATen/TensorUtils.h>
#include <THC/THC.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <vector>

extern THCState *state;

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor &points_tensor, at::Tensor &idx_tensor,
                          at::Tensor &out_tensor);

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const at::Tensor &points_tensor,
                                   const at::Tensor &idx_tensor,
                                   at::Tensor &out_tensor);

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor &grad_out_tensor,
                               at::Tensor &idx_tensor,
                               at::Tensor &grad_points_tensor);

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const at::Tensor &grad_out_tensor,
                                        const at::Tensor &idx_tensor,
                                        at::Tensor &grad_points_tensor);

int gather_points_wrapper(int b, int c, int n, int npoints,
                          at::Tensor &points_tensor, at::Tensor &idx_tensor,
                          at::Tensor &out_tensor) {
  gather_points_kernel_launcher(b, c, n, npoints, points_tensor, idx_tensor,
                                out_tensor);
  return 1;
}

int gather_points_grad_wrapper(int b, int c, int n, int npoints,
                               at::Tensor &grad_out_tensor,
                               at::Tensor &idx_tensor,
                               at::Tensor &grad_points_tensor) {
  gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out_tensor,
                                     idx_tensor, grad_points_tensor);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("gather_points_wrapper", &gather_points_wrapper,
        "gather_points_wrapper");
  m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
        "gather_points_grad_wrapper");
}
```
mmdet3d/ops/gather_points/src/gather_points_cuda.cu (deleted)

```cuda
#include <stdio.h>
#include <stdlib.h>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/types.h>

#include <ATen/cuda/CUDAApplyUtils.cuh>

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

template <typename scalar_t>
__global__ void gather_points_kernel(int b, int c, int n, int m,
                                     const scalar_t *__restrict__ points,
                                     const int *__restrict__ idx,
                                     scalar_t *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, M)
  // output:
  //      out: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  points += bs_idx * c * n + c_idx * n;
  out[0] = points[idx[0]];
}

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const at::Tensor &points_tensor,
                                   const at::Tensor &idx_tensor,
                                   at::Tensor &out_tensor) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  //      out: (B, C, npoints)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      out_tensor.scalar_type(), "gather_points_kernel", [&] {
        const scalar_t *points = points_tensor.data_ptr<scalar_t>();
        const int *idx = idx_tensor.data_ptr<int>();
        scalar_t *out = out_tensor.data_ptr<scalar_t>();
        gather_points_kernel<<<blocks, threads, 0, stream>>>(
            b, c, n, npoints, points, idx, out);
      });

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

template <typename scalar_t>
__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
                                          const scalar_t *__restrict__ grad_out,
                                          const int *__restrict__ idx,
                                          scalar_t *__restrict__ grad_points) {
  // grad_out: (B, C, M)
  // idx: (B, M)
  // output:
  //      grad_points: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  grad_points += bs_idx * c * n + c_idx * n;

  atomicAdd(grad_points + idx[0], grad_out[0]);
}

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const at::Tensor &grad_out_tensor,
                                        const at::Tensor &idx_tensor,
                                        at::Tensor &grad_points_tensor) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  //      grad_points: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_points_tensor.scalar_type(), "gather_points_grad_kernel", [&] {
        const scalar_t *grad_out = grad_out_tensor.data_ptr<scalar_t>();
        const int *idx = idx_tensor.data_ptr<int>();
        scalar_t *grad_points = grad_points_tensor.data_ptr<scalar_t>();
        gather_points_grad_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            b, c, n, npoints, grad_out, idx, grad_points);
      });

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
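Functionally, the forward kernel above is an index-select along the last axis. As a mental model (and a CPU reference that is not part of the deleted code), a hypothetical pure-PyTorch equivalent would be:

```python
import torch

def gather_points_reference(features: torch.Tensor,
                            indices: torch.Tensor) -> torch.Tensor:
    """CPU reference for the kernel: out[b, c, j] = features[b, c, indices[b, j]]."""
    C = features.size(1)
    idx = indices.long().unsqueeze(1).expand(-1, C, -1)  # (B, M) -> (B, C, M)
    return features.gather(dim=2, index=idx)
```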
mmdet3d/ops/group_points/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .group_points import GroupAll, QueryAndGroup, grouping_operation

__all__ = ['QueryAndGroup', 'GroupAll', 'grouping_operation']
```
mmdet3d/ops/group_points/group_points.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from mmcv.runner import force_fp32
from torch import nn as nn
from torch.autograd import Function

from ..ball_query import ball_query
from ..knn import knn
from . import group_points_ext


class QueryAndGroup(nn.Module):
    """Query and Group.

    Groups with a ball query of radius

    Args:
        max_radius (float): The maximum radius of the balls.
            If None is given, we will use kNN sampling instead of ball query.
        sample_num (int): Maximum number of features to gather in the ball.
        min_radius (float, optional): The minimum radius of the balls.
            Default: 0.
        use_xyz (bool, optional): Whether to use xyz.
            Default: True.
        return_grouped_xyz (bool, optional): Whether to return grouped xyz.
            Default: False.
        normalize_xyz (bool, optional): Whether to normalize xyz.
            Default: False.
        uniform_sample (bool, optional): Whether to sample uniformly.
            Default: False
        return_unique_cnt (bool, optional): Whether to return the count of
            unique samples. Default: False.
        return_grouped_idx (bool, optional): Whether to return grouped idx.
            Default: False.
    """

    def __init__(self,
                 max_radius,
                 sample_num,
                 min_radius=0,
                 use_xyz=True,
                 return_grouped_xyz=False,
                 normalize_xyz=False,
                 uniform_sample=False,
                 return_unique_cnt=False,
                 return_grouped_idx=False):
        super(QueryAndGroup, self).__init__()
        self.max_radius = max_radius
        self.min_radius = min_radius
        self.sample_num = sample_num
        self.use_xyz = use_xyz
        self.return_grouped_xyz = return_grouped_xyz
        self.normalize_xyz = normalize_xyz
        self.uniform_sample = uniform_sample
        self.return_unique_cnt = return_unique_cnt
        self.return_grouped_idx = return_grouped_idx
        if self.return_unique_cnt:
            assert self.uniform_sample, \
                'uniform_sample should be True when ' \
                'returning the count of unique samples'
        if self.max_radius is None:
            assert not self.normalize_xyz, \
                'can not normalize grouped xyz when max_radius is None'
        self.fp16_enabled = False

    @force_fp32()
    def forward(self, points_xyz, center_xyz, features=None):
        """forward.

        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) Centriods.
            features (Tensor): (B, C, N) Descriptors of the features.

        Return:
            Tensor: (B, 3 + C, npoint, sample_num) Grouped feature.
        """
        # if self.max_radius is None, we will perform kNN instead of ball query
        # idx is of shape [B, npoint, sample_num]
        if self.max_radius is None:
            idx = knn(self.sample_num, points_xyz, center_xyz, False)
            idx = idx.transpose(1, 2).contiguous()
        else:
            idx = ball_query(self.min_radius, self.max_radius,
                             self.sample_num, points_xyz, center_xyz)

        if self.uniform_sample:
            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
            for i_batch in range(idx.shape[0]):
                for i_region in range(idx.shape[1]):
                    unique_ind = torch.unique(idx[i_batch, i_region, :])
                    num_unique = unique_ind.shape[0]
                    unique_cnt[i_batch, i_region] = num_unique
                    sample_ind = torch.randint(
                        0,
                        num_unique, (self.sample_num - num_unique, ),
                        dtype=torch.long)
                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
                    idx[i_batch, i_region, :] = all_ind

        xyz_trans = points_xyz.transpose(1, 2).contiguous()
        # (B, 3, npoint, sample_num)
        grouped_xyz = grouping_operation(xyz_trans, idx)
        grouped_xyz_diff = grouped_xyz - \
            center_xyz.transpose(1, 2).unsqueeze(-1)  # relative offsets

        if self.normalize_xyz:
            grouped_xyz_diff /= self.max_radius

        if features is not None:
            grouped_features = grouping_operation(features, idx)
            if self.use_xyz:
                # (B, C + 3, npoint, sample_num)
                new_features = torch.cat([grouped_xyz_diff, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            assert (self.use_xyz
                    ), 'Cannot have not features and not use xyz as a feature!'
            new_features = grouped_xyz_diff

        ret = [new_features]
        if self.return_grouped_xyz:
            ret.append(grouped_xyz)
        if self.return_unique_cnt:
            ret.append(unique_cnt)
        if self.return_grouped_idx:
            ret.append(idx)
        if len(ret) == 1:
            return ret[0]
        else:
            return tuple(ret)


class GroupAll(nn.Module):
    """Group All.

    Group xyz with feature.

    Args:
        use_xyz (bool): Whether to use xyz.
    """

    def __init__(self, use_xyz: bool = True):
        super().__init__()
        self.use_xyz = use_xyz
        self.fp16_enabled = False

    @force_fp32()
    def forward(self,
                xyz: torch.Tensor,
                new_xyz: torch.Tensor,
                features: torch.Tensor = None):
        """forward.

        Args:
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            new_xyz (Tensor): Ignored.
            features (Tensor): (B, C, N) features to group.

        Return:
            Tensor: (B, C + 3, 1, N) Grouped feature.
        """
        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
        if features is not None:
            grouped_features = features.unsqueeze(2)
            if self.use_xyz:
                # (B, 3 + C, 1, N)
                new_features = torch.cat([grouped_xyz, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            new_features = grouped_xyz

        return new_features


class GroupingOperation(Function):
    """Grouping Operation.

    Group feature with given index.
    """

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """forward.

        Args:
            features (Tensor): (B, C, N) tensor of features to group.
            indices (Tensor): (B, npoint, nsample) the indices of
                features to group with.

        Returns:
            Tensor: (B, C, npoint, nsample) Grouped features.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, nfeatures, nsample = indices.size()
        _, C, N = features.size()
        output = torch.cuda.FloatTensor(B, C, nfeatures, nsample)

        group_points_ext.forward(B, C, N, nfeatures, nsample, features,
                                 indices, output)

        ctx.for_backwards = (indices, N)
        return output

    @staticmethod
    def backward(ctx,
                 grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """backward.

        Args:
            grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients
                of the output from forward.

        Returns:
            Tensor: (B, C, N) gradient of the features.
        """
        idx, N = ctx.for_backwards
        B, C, npoint, nsample = grad_out.size()

        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()

        grad_out_data = grad_out.data.contiguous()
        group_points_ext.backward(B, C, N, npoint, nsample, grad_out_data,
                                  idx, grad_features.data)
        return grad_features, None


grouping_operation = GroupingOperation.apply
```
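A minimal usage sketch of `QueryAndGroup` with the default `use_xyz=True`, so relative offsets are concatenated in front of the grouped features. The shapes and radii here are made-up example values, and the import path is the pre-1.0.0rc1 layout removed by this commit:

```python
import torch
from mmdet3d.ops.group_points import QueryAndGroup

grouper = QueryAndGroup(max_radius=0.2, sample_num=32, min_radius=0.05)

points_xyz = torch.rand(2, 1024, 3, device='cuda')  # (B, N, 3)
center_xyz = torch.rand(2, 128, 3, device='cuda')   # (B, npoint, 3)
features = torch.rand(2, 16, 1024, device='cuda')   # (B, C, N)

# Output is (B, 3 + C, npoint, sample_num): offsets stacked on features.
new_features = grouper(points_xyz, center_xyz, features)
print(new_features.shape)  # torch.Size([2, 19, 128, 32])
```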
mmdet3d/ops/group_points/src/group_points.cpp (deleted)

```cpp
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp

#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <vector>

extern THCState *state;

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor);

void group_points_kernel_launcher(int b, int c, int n, int npoints,
                                  int nsample, const float *points,
                                  const int *idx, float *out,
                                  cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor,
                              at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor);

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor,
                              at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor) {
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  const float *grad_out = grad_out_tensor.data_ptr<float>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
                                    grad_points, stream);
  return 1;
}

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  float *out = out_tensor.data_ptr<float>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();

  group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
                               stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &group_points_wrapper, "group_points_wrapper");
  m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
}
```
mmdet3d/ops/group_points/src/group_points_cuda.cu (deleted)

```cuda
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu

#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
                                         int nsample,
                                         const float *__restrict__ grad_out,
                                         const int *__restrict__ idx,
                                         float *__restrict__ grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;
  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
              pt_idx * nsample + sample_idx;
  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;

  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, nsample, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void group_points_kernel(int b, int c, int n, int npoints,
                                    int nsample,
                                    const float *__restrict__ points,
                                    const int *__restrict__ idx,
                                    float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;

  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
                pt_idx * nsample + sample_idx;

  out[out_idx] = points[in_idx];
}

void group_points_kernel_launcher(int b, int c, int n, int npoints,
                                  int nsample, const float *points,
                                  const int *idx, float *out,
                                  cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints,
                                                      nsample, points, idx,
                                                      out);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
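The forward kernel above is a batched fancy-index: each output element copies one input feature. A hypothetical PyTorch reference (not part of the deleted code) makes the semantics explicit:

```python
import torch

def grouping_reference(features: torch.Tensor,
                       indices: torch.Tensor) -> torch.Tensor:
    """out[b, c, p, s] = features[b, c, indices[b, p, s]]."""
    B, C, N = features.shape
    _, npoint, nsample = indices.shape
    idx = indices.long().reshape(B, 1, npoint * nsample).expand(-1, C, -1)
    return features.gather(2, idx).view(B, C, npoint, nsample)
```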
mmdet3d/ops/interpolate/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .three_interpolate import three_interpolate
from .three_nn import three_nn

__all__ = ['three_nn', 'three_interpolate']
```
mmdet3d/ops/interpolate/src/interpolate.cpp (deleted)

```cpp
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp

#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <vector>

extern THCState *state;

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor);

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream);

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor);

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream);

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor);

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx,
                                            const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream);

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor) {
  const float *unknown = unknown_tensor.data_ptr<float>();
  const float *known = known_tensor.data_ptr<float>();
  float *dist2 = dist2_tensor.data_ptr<float>();
  int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor,
                               at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *out = out_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
                                    stream);
}

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor) {
  const float *grad_out = grad_out_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
                                         grad_points, stream);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
        "three_interpolate_wrapper");
  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
        "three_interpolate_grad_wrapper");
}
```
mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu (deleted)

```cuda
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_interpolate_kernel(int b, int c, int m, int n,
                                         const float *__restrict__ points,
                                         const int *__restrict__ idx,
                                         const float *__restrict__ weight,
                                         float *__restrict__ out) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  weight += bs_idx * n * 3 + pt_idx * 3;
  points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;
  out += bs_idx * c * n + c_idx * n;

  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
                weight[2] * points[idx[2]];
}

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_kernel<<<blocks, threads, 0, stream>>>(
      b, c, m, n, points, idx, weight, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void three_interpolate_grad_kernel(
    int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight,
    float *__restrict__ grad_points) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
  weight += bs_idx * n * 3 + pt_idx * 3;
  grad_points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;

  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
}

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx,
                                            const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, m, grad_out, idx, weight, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
mmdet3d/ops/interpolate/src/three_nn_cuda.cu (deleted)

```cuda
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= n) return;

  unknown += bs_idx * n * 3 + pt_idx * 3;
  known += bs_idx * m * 3;
  dist2 += bs_idx * n * 3 + pt_idx * 3;
  idx += bs_idx * n * 3 + pt_idx * 3;

  float ux = unknown[0];
  float uy = unknown[1];
  float uz = unknown[2];

  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
  int besti1 = 0, besti2 = 0, besti3 = 0;
  for (int k = 0; k < m; ++k) {
    float x = known[k * 3 + 0];
    float y = known[k * 3 + 1];
    float z = known[k * 3 + 2];
    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
    if (d < best1) {
      best3 = best2;
      besti3 = besti2;
      best2 = best1;
      besti2 = besti1;
      best1 = d;
      besti1 = k;
    } else if (d < best2) {
      best3 = best2;
      besti3 = besti2;
      best2 = d;
      besti2 = k;
    } else if (d < best3) {
      best3 = d;
      besti3 = k;
    }
  }
  dist2[0] = best1;
  dist2[1] = best2;
  dist2[2] = best3;
  idx[0] = besti1;
  idx[1] = besti2;
  idx[2] = besti3;
}

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
                                                  dist2, idx);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
```
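For intuition, a hypothetical PyTorch reference for the kernel above (not part of the deleted code): each query point keeps a running top-3 of squared distances, which is equivalent to a full pairwise distance matrix followed by a smallest-3 `topk`. Note the kernel stores squared distances; the Python wrapper in `three_nn.py` applies the square root afterwards:

```python
import torch

def three_nn_kernel_reference(unknown: torch.Tensor, known: torch.Tensor):
    """unknown: (B, N, 3), known: (B, M, 3) -> (dist2, idx), each (B, N, 3)."""
    d2 = ((unknown.unsqueeze(2) - known.unsqueeze(1)) ** 2).sum(-1)  # (B, N, M)
    dist2, idx = d2.topk(3, dim=2, largest=False)  # ascending, like best1..best3
    return dist2, idx.int()
```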
mmdet3d/ops/interpolate/three_interpolate.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from torch.autograd import Function

from . import interpolate_ext


class ThreeInterpolate(Function):

    @staticmethod
    def forward(ctx, features: torch.Tensor, indices: torch.Tensor,
                weight: torch.Tensor) -> torch.Tensor:
        """Performs weighted linear interpolation on 3 features.

        Args:
            features (Tensor): (B, C, M) Features descriptors to be
                interpolated from
            indices (Tensor): (B, n, 3) index three nearest neighbors
                of the target features in features
            weight (Tensor): (B, n, 3) weights of interpolation

        Returns:
            Tensor: (B, C, N) tensor of the interpolated features
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()
        assert weight.is_contiguous()

        B, c, m = features.size()
        n = indices.size(1)
        ctx.three_interpolate_for_backward = (indices, weight, m)
        output = torch.cuda.FloatTensor(B, c, n)

        interpolate_ext.three_interpolate_wrapper(B, c, m, n, features,
                                                  indices, weight, output)
        return output

    @staticmethod
    def backward(
        ctx, grad_out: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Backward of three interpolate.

        Args:
            grad_out (Tensor): (B, C, N) tensor with gradients of outputs

        Returns:
            Tensor: (B, C, M) tensor with gradients of features
        """
        idx, weight, m = ctx.three_interpolate_for_backward
        B, c, n = grad_out.size()

        grad_features = torch.cuda.FloatTensor(B, c, m).zero_()
        grad_out_data = grad_out.data.contiguous()

        interpolate_ext.three_interpolate_grad_wrapper(B, c, n, m,
                                                       grad_out_data, idx,
                                                       weight,
                                                       grad_features.data)
        return grad_features, None, None


three_interpolate = ThreeInterpolate.apply
```
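The forward pass is a weighted gather over each point's three neighbors. A hypothetical CPU reference (not part of the deleted code) spells out the formula `out[b, c, j] = sum_k weight[b, j, k] * features[b, c, indices[b, j, k]]`:

```python
import torch

def three_interpolate_reference(features: torch.Tensor,
                                indices: torch.Tensor,
                                weight: torch.Tensor) -> torch.Tensor:
    """features: (B, C, M), indices/weight: (B, n, 3) -> (B, C, n)."""
    B, C, M = features.shape
    n = indices.shape[1]
    idx = indices.long().reshape(B, 1, n * 3).expand(-1, C, -1)
    gathered = features.gather(2, idx).view(B, C, n, 3)
    return (gathered * weight.unsqueeze(1)).sum(dim=3)
```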
mmdet3d/ops/interpolate/three_nn.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from torch.autograd import Function

from . import interpolate_ext


class ThreeNN(Function):

    @staticmethod
    def forward(ctx, target: torch.Tensor,
                source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Find the top-3 nearest neighbors of the target set from the source
        set.

        Args:
            target (Tensor): shape (B, N, 3), points set that needs to
                find the nearest neighbors.
            source (Tensor): shape (B, M, 3), points set that is used
                to find the nearest neighbors of points in target set.

        Returns:
            Tensor: shape (B, N, 3), L2 distance of each point in target
                set to their corresponding nearest neighbors.
        """
        assert target.is_contiguous()
        assert source.is_contiguous()

        B, N, _ = target.size()
        m = source.size(1)
        dist2 = torch.cuda.FloatTensor(B, N, 3)
        idx = torch.cuda.IntTensor(B, N, 3)

        interpolate_ext.three_nn_wrapper(B, N, m, target, source, dist2, idx)

        ctx.mark_non_differentiable(idx)

        return torch.sqrt(dist2), idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        return None, None


three_nn = ThreeNN.apply
```
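Together, `three_nn` and `three_interpolate` implement the usual PointNet++ feature-propagation step. A usage sketch with made-up shapes, using the pre-1.0.0rc1 import path removed here; the inverse-distance weighting is the conventional recipe, not something the ops themselves mandate:

```python
import torch
from mmdet3d.ops.interpolate import three_interpolate, three_nn

known_xyz = torch.rand(2, 256, 3, device='cuda')     # (B, M, 3) sparse set
unknown_xyz = torch.rand(2, 1024, 3, device='cuda')  # (B, N, 3) dense set
known_feats = torch.rand(2, 64, 256, device='cuda')  # (B, C, M)

dist, idx = three_nn(unknown_xyz, known_xyz)         # both (B, N, 3)

# Inverse-distance weights, normalized over the 3 neighbors.
weight = 1.0 / (dist + 1e-8)
weight = weight / weight.sum(dim=2, keepdim=True)

interpolated = three_interpolate(known_feats, idx, weight)  # (B, 64, 1024)
```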
mmdet3d/ops/iou3d/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .iou3d_utils import boxes_iou_bev, nms_gpu, nms_normal_gpu

__all__ = ['boxes_iou_bev', 'nms_gpu', 'nms_normal_gpu']
```
mmdet3d/ops/iou3d/iou3d_utils.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from . import iou3d_cuda


def boxes_iou_bev(boxes_a, boxes_b):
    """Calculate boxes IoU in the Bird's Eye View.

    Args:
        boxes_a (torch.Tensor): Input boxes a with shape (M, 5).
        boxes_b (torch.Tensor): Input boxes b with shape (N, 5).

    Returns:
        ans_iou (torch.Tensor): IoU result with shape (M, N).
    """
    ans_iou = boxes_a.new_zeros(
        torch.Size((boxes_a.shape[0], boxes_b.shape[0])))

    iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(),
                                 ans_iou)

    return ans_iou


def nms_gpu(boxes, scores, thresh, pre_max_size=None, post_max_size=None):
    """NMS function GPU implementation (for BEV boxes). The overlap of two
    boxes for IoU calculation is defined as the exact overlapping area of the
    two boxes. In this function, one can also set `pre_max_size` and
    `post_max_size`.

    Args:
        boxes (torch.Tensor): Input boxes with the shape of [N, 5]
            ([x1, y1, x2, y2, ry]).
        scores (torch.Tensor): Scores of boxes with the shape of [N].
        thresh (int): Threshold.
        pre_max_size (int, optional): Max size of boxes before NMS.
            Default: None.
        post_max_size (int, optional): Max size of boxes after NMS.
            Default: None.

    Returns:
        torch.Tensor: Indexes after NMS.
    """
    order = scores.sort(0, descending=True)[1]

    if pre_max_size is not None:
        order = order[:pre_max_size]
    boxes = boxes[order].contiguous()

    keep = torch.zeros(boxes.size(0), dtype=torch.long)
    num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh, boxes.device.index)
    keep = order[keep[:num_out].cuda(boxes.device)].contiguous()
    if post_max_size is not None:
        keep = keep[:post_max_size]
    return keep


def nms_normal_gpu(boxes, scores, thresh):
    """Normal NMS function GPU implementation (for BEV boxes). The overlap of
    two boxes for IoU calculation is defined as the exact overlapping area of
    the two boxes WITH their yaw angle set to 0.

    Args:
        boxes (torch.Tensor): Input boxes with shape (N, 5).
        scores (torch.Tensor): Scores of predicted boxes with shape (N).
        thresh (torch.Tensor): Threshold of NMS.

    Returns:
        torch.Tensor: Remaining indices with scores in descending order.
    """
    order = scores.sort(0, descending=True)[1]

    boxes = boxes[order].contiguous()

    keep = torch.zeros(boxes.size(0), dtype=torch.long)
    num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh,
                                        boxes.device.index)
    return order[keep[:num_out].cuda(boxes.device)].contiguous()
```
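A small usage sketch of `nms_gpu` on made-up boxes, with the pre-1.0.0rc1 import path removed by this commit. Boxes are BEV rectangles `[x1, y1, x2, y2, ry]` with one confidence score each:

```python
import torch
from mmdet3d.ops.iou3d import nms_gpu

boxes = torch.tensor([[0.0, 0.0, 2.0, 4.0, 0.0],
                      [0.1, 0.0, 2.1, 4.0, 0.0],   # heavy overlap with box 0
                      [5.0, 5.0, 7.0, 9.0, 0.3]], device='cuda')
scores = torch.tensor([0.9, 0.8, 0.7], device='cuda')

keep = nms_gpu(boxes, scores, thresh=0.5)
print(keep)  # indices of surviving boxes, highest score first (e.g. 0 and 2)
```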
mmdet3d/ops/iou3d/src/iou3d.cpp (deleted)

```cpp
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>

#include <cstdint>
#include <vector>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

#define CHECK_ERROR(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a,
                         const int num_b, const float *boxes_b,
                         float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);

int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                          at::Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_overlap);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_overlap_data = ans_overlap.data_ptr<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}

int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
                      at::Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_iou);

  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  const float *boxes_a_data = boxes_a.data_ptr<float>();
  const float *boxes_b_data = boxes_b.data_ptr<float>();
  float *ans_iou_data = ans_iou.data_ptr<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}

int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh,
            int device_id) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long *remv_cpu = new unsigned long long[col_blocks]();

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  delete[] remv_cpu;
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

int nms_normal_gpu(at::Tensor boxes, at::Tensor keep,
                   float nms_overlap_thresh, int device_id) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)

  CHECK_INPUT(boxes);
  CHECK_CONTIGUOUS(keep);
  cudaSetDevice(device_id);

  int boxes_num = boxes.size(0);
  const float *boxes_data = boxes.data_ptr<float>();
  int64_t *keep_data = keep.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  unsigned long long *remv_cpu = new unsigned long long[col_blocks]();

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  delete[] remv_cpu;
  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");

  return num_to_keep;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu,
        "oriented boxes overlap");
  m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
  m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
  m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
}
```
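The host-side loop in `nms_gpu` consumes a bitmask produced on the GPU: `mask` has one row per box, each row holding `col_blocks` 64-bit words, and bit `j` of word `w` is set when the box overlaps box `w*64 + j` above the threshold. A hypothetical Python transliteration of that scan (not part of the deleted code) may make it easier to follow:

```python
def suppress(mask, boxes_num, threads=64):
    """mask[i] is a list of col_blocks 64-bit words for box i (GPU output)."""
    col_blocks = (boxes_num + threads - 1) // threads
    remv = [0] * col_blocks  # accumulated suppression bits
    keep = []
    for i in range(boxes_num):  # boxes arrive sorted by score
        nblock, inblock = divmod(i, threads)
        if not (remv[nblock] >> inblock) & 1:  # box i not yet suppressed
            keep.append(i)
            for j in range(nblock, col_blocks):
                remv[j] |= mask[i][j]  # suppress everything box i overlaps
    return keep
```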
mmdet3d/ops/iou3d/src/iou3d_kernel.cu (deleted)

```cuda
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/

#include <stdio.h>

#define THREADS_PER_BLOCK 16
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

//#define DEBUG
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
__device__ const float EPS = 1e-8;

struct Point {
  float x, y;
  __device__ Point() {}
  __device__ Point(double _x, double _y) { x = _x, y = _y; }

  __device__ void set(float _x, float _y) {
    x = _x;
    y = _y;
  }

  __device__ Point operator+(const Point &b) const {
    return Point(x + b.x, y + b.y);
  }

  __device__ Point operator-(const Point &b) const {
    return Point(x - b.x, y - b.y);
  }
};

__device__ inline float cross(const Point &a, const Point &b) {
  return a.x * b.y - a.y * b.x;
}

__device__ inline float cross(const Point &p1, const Point &p2,
                              const Point &p0) {
  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
}

__device__ int check_rect_cross(const Point &p1, const Point &p2,
                                const Point &q1, const Point &q2) {
  int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
            min(q1.x, q2.x) <= max(p1.x, p2.x) &&
            min(p1.y, p2.y) <= max(q1.y, q2.y) &&
            min(q1.y, q2.y) <= max(p1.y, p2.y);
  return ret;
}

__device__ inline int check_in_box2d(const float *box, const Point &p) {
  // params: box (5) [x1, y1, x2, y2, angle]
  const float MARGIN = 1e-5;

  float center_x = (box[0] + box[2]) / 2;
  float center_y = (box[1] + box[3]) / 2;
  // rotate the point in the opposite direction of box
  float angle_cos = cos(-box[4]), angle_sin = sin(-box[4]);
  float rot_x =
      (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x;
  float rot_y =
      (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
#ifdef DEBUG
  printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2],
         box[3], box[4]);
  printf(
      "center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, "
      "%.3f)\n",
      center_x, center_y, angle_cos, angle_sin, p.x, p.y, rot_x, rot_y);
#endif
  return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
          rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
}

__device__ inline int intersection(const Point &p1, const Point &p0,
                                   const Point &q1, const Point &q0,
                                   Point &ans) {
  // fast exclusion
  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;

  // check cross standing
  float s1 = cross(q0, p1, p0);
  float s2 = cross(p1, q1, p0);
  float s3 = cross(p0, q1, q0);
  float s4 = cross(q1, p1, q0);

  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;

  // calculate intersection of two lines
  float s5 = cross(q1, p1, p0);
  if (fabs(s5 - s1) > EPS) {
    ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
    ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
  } else {
    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
    float D = a0 * b1 - a1 * b0;

    ans.x = (b0 * c1 - b1 * c0) / D;
    ans.y = (a1 * c0 - a0 * c1) / D;
  }

  return 1;
}

__device__ inline void rotate_around_center(const Point &center,
                                            const float angle_cos,
                                            const float angle_sin, Point &p) {
  float new_x =
      (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x;
  float new_y =
      (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
  p.set(new_x, new_y);
}

__device__ inline int point_cmp(const Point &a, const Point &b,
                                const Point &center) {
  return atan2(a.y - center.y, a.x - center.x) >
         atan2(b.y - center.y, b.x - center.x);
}

__device__ inline float box_overlap(const float *box_a, const float *box_b) {
  // params: box_a (5) [x1, y1, x2, y2, angle]
  // params: box_b (5) [x1, y1, x2, y2, angle]

  float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3],
        a_angle = box_a[4];
  float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3],
        b_angle = box_b[4];

  Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2);
  Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2);
#ifdef DEBUG
  printf(
      "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n",
      a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle);
  printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y,
         center_b.x, center_b.y);
#endif

  Point box_a_corners[5];
  box_a_corners[0].set(a_x1, a_y1);
  box_a_corners[1].set(a_x2, a_y1);
  box_a_corners[2].set(a_x2, a_y2);
  box_a_corners[3].set(a_x1, a_y2);

  Point box_b_corners[5];
  box_b_corners[0].set(b_x1, b_y1);
  box_b_corners[1].set(b_x2, b_y1);
  box_b_corners[2].set(b_x2, b_y2);
  box_b_corners[3].set(b_x1, b_y2);

  // get oriented corners
  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);

  for (int k = 0; k < 4; k++) {
#ifdef DEBUG
    printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f)\n", k,
           box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
           box_b_corners[k].y);
#endif
    rotate_around_center(center_a, a_angle_cos, a_angle_sin,
                         box_a_corners[k]);
    rotate_around_center(center_b, b_angle_cos, b_angle_sin,
                         box_b_corners[k]);
#ifdef DEBUG
    printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f)\n", k,
           box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
           box_b_corners[k].y);
#endif
  }

  box_a_corners[4] = box_a_corners[0];
  box_b_corners[4] = box_b_corners[0];

  // get intersection of lines
  Point cross_points[16];
  Point poly_center;
  int cnt = 0, flag = 0;

  poly_center.set(0, 0);
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      flag = intersection(box_a_corners[i + 1], box_a_corners[i],
                          box_b_corners[j + 1], box_b_corners[j],
                          cross_points[cnt]);
      if (flag) {
        poly_center = poly_center + cross_points[cnt];
        cnt++;
      }
    }
  }

  // check corners
  for (int k = 0; k < 4; k++) {
    if (check_in_box2d(box_a, box_b_corners[k])) {
      poly_center = poly_center + box_b_corners[k];
      cross_points[cnt] = box_b_corners[k];
      cnt++;
    }
    if (check_in_box2d(box_b, box_a_corners[k])) {
      poly_center = poly_center + box_a_corners[k];
      cross_points[cnt] = box_a_corners[k];
      cnt++;
    }
  }

  poly_center.x /= cnt;
  poly_center.y /= cnt;

  // sort the points of polygon
  Point temp;
  for (int j = 0; j < cnt - 1; j++) {
    for (int i = 0; i < cnt - j - 1; i++) {
      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
        temp = cross_points[i];
        cross_points[i] = cross_points[i + 1];
        cross_points[i + 1] = temp;
      }
    }
  }

#ifdef DEBUG
  printf("cnt=%d\n", cnt);
  for (int i = 0; i < cnt; i++) {
    printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x,
           cross_points[i].y);
  }
#endif

  // get the overlap areas
  float area = 0;
  for (int k = 0; k < cnt - 1; k++) {
    area += cross(cross_points[k] - cross_points[0],
                  cross_points[k + 1] - cross_points[0]);
  }

  return fabs(area) / 2.0;
}

__device__ inline float iou_bev(const float *box_a, const float *box_b) {
  // params: box_a (5) [x1, y1, x2, y2, angle]
  // params: box_b (5) [x1, y1, x2, y2, angle]
  float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
  float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
  float s_overlap = box_overlap(box_a, box_b);
  return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
}

__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a,
                                     const int num_b, const float *boxes_b,
                                     float *ans_overlap) {
  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;

  if (a_idx >= num_a || b_idx >= num_b) {
    return;
  }
  const float *cur_box_a = boxes_a + a_idx * 5;
  const float *cur_box_b = boxes_b + b_idx * 5;
  float s_overlap = box_overlap(cur_box_a, cur_box_b);
  ans_overlap[a_idx * num_b + b_idx] = s_overlap;
}

__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a,
                                     const int num_b, const float *boxes_b,
                                     float *ans_iou) {
  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;

  if (a_idx >= num_a || b_idx >= num_b) {
    return;
  }

  const float *cur_box_a = boxes_a + a_idx * 5;
  const float *cur_box_b = boxes_b + b_idx * 5;
  float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
  ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
}

__global__ void nms_kernel(const int boxes_num,
                           const float nms_overlap_thresh, const float *boxes,
                           unsigned long long *mask) {
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);
  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);

  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];

  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
    const float *cur_box = boxes + cur_box_idx * 5;

    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
    mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

__device__ inline float iou_normal(float const *const a,
                                   float const *const b) {
  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
  float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0]) * (a[3] - a[1]);
  float Sb = (b[2] - b[0]) * (b[3] - b[1]);
  return interS / fmaxf(Sa + Sb - interS, EPS);
}

__global__ void nms_normal_kernel(const int boxes_num,
                                  const float nms_overlap_thresh,
                                  const float *boxes,
                                  unsigned long long *mask) {
  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);
  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                             THREADS_PER_BLOCK_NMS);

  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];

  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
    const float *cur_box = boxes + cur_box_idx * 5;

    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
    mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap) {
  dim3 blocks(
      DIVUP(num_b, THREADS_PER_BLOCK),
      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);

  boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
                                            ans_overlap);
#ifdef DEBUG
  cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}

void boxesioubevLauncher(const int num_a, const float *boxes_a,
                         const int num_b, const float *boxes_b,
                         float *ans_iou) {
  dim3 blocks(
      DIVUP(num_b, THREADS_PER_BLOCK),
      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);

  boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
                                            ans_iou);
}

void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh) {
  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
}

void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh) {
  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
  dim3 threads(THREADS_PER_BLOCK_NMS);
  nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes,
                                         mask);
}
```
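While `iou_bev` needs the full polygon-clipping machinery for rotated boxes, `iou_normal` is plain axis-aligned IoU with the yaw ignored. A hypothetical Python transliteration (not part of the deleted code) for comparison:

```python
def iou_normal(a, b, eps=1e-8):
    """a, b: [x1, y1, x2, y2, ry]; ry is ignored, matching iou_normal above."""
    left, right = max(a[0], b[0]), min(a[2], b[2])
    top, bottom = max(a[1], b[1]), min(a[3], b[3])
    inter = max(right - left, 0.0) * max(bottom - top, 0.0)
    sa = (a[2] - a[0]) * (a[3] - a[1])
    sb = (b[2] - b[0]) * (b[3] - b[1])
    return inter / max(sa + sb - inter, eps)
```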
mmdet3d/ops/knn/__init__.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
from .knn import knn

__all__ = ['knn']
```
mmdet3d/ops/knn/knn.py (deleted)

```python
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.autograd import Function

from . import knn_ext


class KNN(Function):
    r"""KNN (CUDA) based on heap data structure.

    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/pointops/src/knnquery_heap>`_.

    Find k-nearest points.
    """

    @staticmethod
    def forward(ctx,
                k: int,
                xyz: torch.Tensor,
                center_xyz: torch.Tensor = None,
                transposed: bool = False) -> torch.Tensor:
        """Forward.

        Args:
            k (int): number of nearest neighbors.
            xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N).
                xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) if transposed == False,
                else (B, 3, npoint). centers of the knn query.
            transposed (bool): whether the input tensors are transposed.
                defaults to False. Should not explicitly use this keyword
                when calling knn (=KNN.apply), just add the fourth param.

        Returns:
            Tensor: (B, k, npoint) tensor with the indices of
                the features that form k-nearest neighbours.
        """
        assert k > 0

        if center_xyz is None:
            center_xyz = xyz

        if transposed:
            xyz = xyz.transpose(2, 1).contiguous()
            center_xyz = center_xyz.transpose(2, 1).contiguous()

        assert xyz.is_contiguous()  # [B, N, 3]
        assert center_xyz.is_contiguous()  # [B, npoint, 3]

        center_xyz_device = center_xyz.get_device()
        assert center_xyz_device == xyz.get_device(), \
            'center_xyz and xyz should be put on the same device'
        if torch.cuda.current_device() != center_xyz_device:
            torch.cuda.set_device(center_xyz_device)

        B, npoint, _ = center_xyz.shape
        N = xyz.shape[1]

        idx = center_xyz.new_zeros((B, npoint, k)).int()
        dist2 = center_xyz.new_zeros((B, npoint, k)).float()

        knn_ext.knn_wrapper(B, N, npoint, k, xyz, center_xyz, idx, dist2)
        # idx shape to [B, k, npoint]
        idx = idx.transpose(2, 1).contiguous()
        ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        return None, None, None


knn = KNN.apply
```
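A short usage sketch of `knn` on made-up inputs, via the pre-1.0.0rc1 import path removed by this commit. Note the output layout is (B, k, npoint), not (B, npoint, k):

```python
import torch
from mmdet3d.ops.knn import knn

xyz = torch.rand(2, 1024, 3, device='cuda')     # (B, N, 3) candidate points
centers = torch.rand(2, 128, 3, device='cuda')  # (B, npoint, 3) query centers

# Indices of the 16 nearest neighbors of each center.
idx = knn(16, xyz, centers)
print(idx.shape)  # torch.Size([2, 16, 128])
```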