Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenPCDet
Commits
43baf787
Commit
43baf787
authored
Jul 27, 2020
by
Shaoshuai Shi
Browse files
Merge branch 'dev_pointrcnn' into dev_v0.2.1
parents
85ff046d
8075b170
Changes
43
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1587 additions
and
0 deletions
+1587
-0
pcdet/ops/pointnet2/pointnet2_batch/src/group_points.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/group_points.cpp
+43
-0
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.cu
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.cu
+92
-0
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.h
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.h
+22
-0
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate.cpp
+61
-0
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.cu
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.cu
+168
-0
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.h
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.h
+30
-0
pcdet/ops/pointnet2/pointnet2_batch/src/pointnet2_api.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/pointnet2_api.cpp
+24
-0
pcdet/ops/pointnet2/pointnet2_batch/src/sampling.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/sampling.cpp
+53
-0
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.cu
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.cu
+260
-0
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.h
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.h
+29
-0
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_modules.py
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_modules.py
+45
-0
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_utils.py
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_utils.py
+78
-0
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate.cpp
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate.cpp
+81
-0
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.cu
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.cu
+193
-0
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.h
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.h
+39
-0
pcdet/ops/pointnet2/pointnet2_stack/src/pointnet2_api.cpp
pcdet/ops/pointnet2/pointnet2_stack/src/pointnet2_api.cpp
+5
-0
pcdet/ops/roipoint_pool3d/roipoint_pool3d_utils.py
pcdet/ops/roipoint_pool3d/roipoint_pool3d_utils.py
+66
-0
pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
+51
-0
pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
+165
-0
pcdet/utils/box_coder_utils.py
pcdet/utils/box_coder_utils.py
+82
-0
No files found.
pcdet/ops/pointnet2/pointnet2_batch/src/group_points.cpp
0 → 100644
View file @
43baf787
/*
batch version of point grouping, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <THC/THC.h>
#include "group_points_gpu.h"
extern
THCState
*
state
;
int
group_points_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
)
{
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
group_points_grad_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
nsample
,
grad_out
,
idx
,
grad_points
,
stream
);
return
1
;
}
int
group_points_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
group_points_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
nsample
,
points
,
idx
,
out
,
stream
);
return
1
;
}
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.cu
0 → 100644
View file @
43baf787
/*
batch version of point grouping, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "group_points_gpu.h"
__global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample,
    const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
    // grad_out: (B, C, npoints, nsample)
    // idx: (B, npoints, nsample)
    // output:
    //      grad_points: (B, C, N)
    // One thread per (batch, channel, point, sample) element of grad_out.
    const int batch = blockIdx.z;
    const int chan = blockIdx.y;
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;
    const int point = flat / nsample;
    if (batch >= b || chan >= c || point >= npoints) return;
    const int sample = flat % nsample;

    const int go_off = ((batch * c + chan) * npoints + point) * nsample + sample;
    const int src = idx[(batch * npoints + point) * nsample + sample];
    // The same source point can be sampled many times, so accumulate atomically.
    atomicAdd(grad_points + (batch * c + chan) * n + src, grad_out[go_off]);
}
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
    // grad_out: (B, C, npoints, nsample); idx: (B, npoints, nsample)
    // output: grad_points: (B, C, N)
    // Grid: x covers npoints*nsample elements, y = channels, z = batch.
    dim3 threads(THREADS_PER_BLOCK);
    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);

    group_points_grad_kernel_fast<<<blocks, threads, 0, stream>>>(
        b, c, n, npoints, nsample, grad_out, idx, grad_points);

    // Launch-configuration errors surface here; execution errors at the next sync.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
__global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample,
    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
    // points: (B, C, N)
    // idx: (B, npoints, nsample)
    // output:
    //      out: (B, C, npoints, nsample)
    // Pure gather: out[b, c, p, s] = points[b, c, idx[b, p, s]].
    const int batch = blockIdx.z;
    const int chan = blockIdx.y;
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;
    const int point = flat / nsample;
    if (batch >= b || chan >= c || point >= npoints) return;
    const int sample = flat % nsample;

    const int src = idx[(batch * npoints + point) * nsample + sample];
    const int dst = ((batch * c + chan) * npoints + point) * nsample + sample;
    out[dst] = points[(batch * c + chan) * n + src];
}
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *points, const int *idx, float *out, cudaStream_t stream) {
    // points: (B, C, N); idx: (B, npoints, nsample)
    // output: out: (B, C, npoints, nsample)
    // Grid: x covers npoints*nsample elements, y = channels, z = batch.
    dim3 threads(THREADS_PER_BLOCK);
    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);

    group_points_kernel_fast<<<blocks, threads, 0, stream>>>(
        b, c, n, npoints, nsample, points, idx, out);
    // cudaDeviceSynchronize(); // for using printf in kernel function

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.h
0 → 100644
View file @
43baf787
#ifndef _GROUP_POINTS_GPU_H
#define _GROUP_POINTS_GPU_H

#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>

// Forward grouping: gather points (B, C, N) by idx (B, npoints, nsample)
// into out (B, C, npoints, nsample). Returns 1 on success.
int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);

// Kernel launcher for the forward grouping; runs on the given stream.
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *points, const int *idx, float *out, cudaStream_t stream);

// Backward grouping: scatter-add grad_out (B, C, npoints, nsample) into
// grad_points (B, C, N) using idx (B, npoints, nsample). Returns 1 on success.
int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);

// Kernel launcher for the backward grouping; runs on the given stream.
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);

#endif
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate.cpp
0 → 100644
View file @
43baf787
/*
batch version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "interpolate_gpu.h"
extern
THCState
*
state
;
void
three_nn_wrapper_fast
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
unknown_tensor
,
at
::
Tensor
known_tensor
,
at
::
Tensor
dist2_tensor
,
at
::
Tensor
idx_tensor
)
{
const
float
*
unknown
=
unknown_tensor
.
data
<
float
>
();
const
float
*
known
=
known_tensor
.
data
<
float
>
();
float
*
dist2
=
dist2_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
three_nn_kernel_launcher_fast
(
b
,
n
,
m
,
unknown
,
known
,
dist2
,
idx
,
stream
);
}
void
three_interpolate_wrapper_fast
(
int
b
,
int
c
,
int
m
,
int
n
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
out_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
three_interpolate_kernel_launcher_fast
(
b
,
c
,
m
,
n
,
points
,
idx
,
weight
,
out
,
stream
);
}
void
three_interpolate_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
grad_points_tensor
)
{
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
three_interpolate_grad_kernel_launcher_fast
(
b
,
c
,
n
,
m
,
grad_out
,
idx
,
weight
,
grad_points
,
stream
);
}
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.cu
0 → 100644
View file @
43baf787
/*
batch version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "interpolate_gpu.h"
__global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
    const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
    // unknown: (B, N, 3)
    // known: (B, M, 3)
    // output:
    //      dist2: (B, N, 3)  squared distances to the 3 nearest known points (ascending)
    //      idx: (B, N, 3)    indices of those points into the M known points
    // One thread per query point; linear scan over all M known points keeping
    // a running top-3 (best1 <= best2 <= best3) by insertion.
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= n) return;

    // Advance each pointer to this (batch, point) row.
    unknown += bs_idx * n * 3 + pt_idx * 3;
    known += bs_idx * m * 3;
    dist2 += bs_idx * n * 3 + pt_idx * 3;
    idx += bs_idx * n * 3 + pt_idx * 3;

    float ux = unknown[0];
    float uy = unknown[1];
    float uz = unknown[2];

    // 1e40 sentinel (double) is larger than any float squared distance, so the
    // first three scanned points always replace the initial bests.
    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
    int besti1 = 0, besti2 = 0, besti3 = 0;
    for (int k = 0; k < m; ++k) {
        float x = known[k * 3 + 0];
        float y = known[k * 3 + 1];
        float z = known[k * 3 + 2];
        float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
        if (d < best1) {
            // New closest point: shift the previous best1/best2 down one slot.
            best3 = best2; besti3 = besti2;
            best2 = best1; besti2 = besti1;
            best1 = d; besti1 = k;
        }
        else if (d < best2) {
            best3 = best2; besti3 = besti2;
            best2 = d; besti2 = k;
        }
        else if (d < best3) {
            best3 = d; besti3 = k;
        }
    }
    // Note: squared distances are written; callers take sqrt on the Python side.
    dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
    idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
}
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
    const float *known, float *dist2, int *idx, cudaStream_t stream) {
    // unknown: (B, N, 3); known: (B, M, 3)
    // output: dist2: (B, N, 3), idx: (B, N, 3)
    // Grid: x covers the N query points, y = batch.
    dim3 threads(THREADS_PER_BLOCK);
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);

    three_nn_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known, dist2, idx);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
__global__ void three_interpolate_kernel_fast(int b, int c, int m, int n,
    const float *__restrict__ points, const int *__restrict__ idx,
    const float *__restrict__ weight, float *__restrict__ out) {
    // points: (B, C, M)
    // idx: (B, N, 3)
    // weight: (B, N, 3)
    // output:
    //      out: (B, C, N)
    // out[b, c, p] = sum over the 3 neighbors of weight * source feature.
    const int batch = blockIdx.z;
    const int chan = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || chan >= c || point >= n) return;

    const float *w = weight + (batch * n + point) * 3;
    const int *nbr = idx + (batch * n + point) * 3;
    const float *row = points + (batch * c + chan) * m;

    out[(batch * c + chan) * n + point] =
        w[0] * row[nbr[0]] + w[1] * row[nbr[1]] + w[2] * row[nbr[2]];
}
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
    const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) {
    // points: (B, C, M); idx: (B, N, 3); weight: (B, N, 3)
    // output: out: (B, C, N)
    // Grid: x covers the N target points, y = channels, z = batch.
    dim3 threads(THREADS_PER_BLOCK);
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);

    three_interpolate_kernel_fast<<<blocks, threads, 0, stream>>>(
        b, c, m, n, points, idx, weight, out);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
__global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m,
    const float *__restrict__ grad_out, const int *__restrict__ idx,
    const float *__restrict__ weight, float *__restrict__ grad_points) {
    // grad_out: (B, C, N)
    // weight: (B, N, 3)
    // output:
    //      grad_points: (B, C, M)
    // Scatter: grad_points[b, c, idx[b, p, k]] += grad_out[b, c, p] * weight[b, p, k].
    const int batch = blockIdx.z;
    const int chan = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || chan >= c || point >= n) return;

    const float g = grad_out[(batch * c + chan) * n + point];
    const float *w = weight + (batch * n + point) * 3;
    const int *nbr = idx + (batch * n + point) * 3;
    float *dst = grad_points + (batch * c + chan) * m;

    // Several target points may share a source neighbor, so adds must be atomic.
    atomicAdd(dst + nbr[0], g * w[0]);
    atomicAdd(dst + nbr[1], g * w[1]);
    atomicAdd(dst + nbr[2], g * w[2]);
}
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m,
    const float *grad_out, const int *idx, const float *weight, float *grad_points,
    cudaStream_t stream) {
    // grad_out: (B, C, N); weight: (B, N, 3)
    // output: grad_points: (B, C, M)
    // Grid: x covers the N target points, y = channels, z = batch.
    dim3 threads(THREADS_PER_BLOCK);
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);

    three_interpolate_grad_kernel_fast<<<blocks, threads, 0, stream>>>(
        b, c, n, m, grad_out, idx, weight, grad_points);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.h
0 → 100644
View file @
43baf787
#ifndef _INTERPOLATE_GPU_H
#define _INTERPOLATE_GPU_H

#include <torch/serialize/tensor.h>
#include<vector>
#include <cuda.h>
#include <cuda_runtime_api.h>

// For each of n query points, find the 3 nearest of m known points; writes
// squared distances into dist2_tensor and indices into idx_tensor.
void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
    at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);

// Kernel launcher for three_nn; runs on the given stream.
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
    const float *known, float *dist2, int *idx, cudaStream_t stream);

// Weighted 3-NN interpolation of features (B, C, M) onto n points -> out (B, C, N).
void three_interpolate_wrapper_fast(int b, int c, int m, int n,
    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor,
    at::Tensor out_tensor);

// Kernel launcher for three_interpolate; runs on the given stream.
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
    const float *points, const int *idx, const float *weight, float *out,
    cudaStream_t stream);

// Backward of three_interpolate: scatter grad_out (B, C, N) into grad_points (B, C, M).
void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor,
    at::Tensor grad_points_tensor);

// Kernel launcher for the interpolation backward pass; runs on the given stream.
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m,
    const float *grad_out, const int *idx, const float *weight, float *grad_points,
    cudaStream_t stream);

#endif
pcdet/ops/pointnet2/pointnet2_batch/src/pointnet2_api.cpp
0 → 100644
View file @
43baf787
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include "ball_query_gpu.h"
#include "group_points_gpu.h"
#include "sampling_gpu.h"
#include "interpolate_gpu.h"
// Python bindings for the batch-version PointNet++ CUDA ops: ball query,
// grouping (fwd/bwd), gathering (fwd/bwd), furthest point sampling, and
// 3-NN interpolation (fwd/bwd).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast");

    m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast");
    m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast");

    m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast");
    m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast");

    m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");

    m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast");
    m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast");
    m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast");
}
pcdet/ops/pointnet2/pointnet2_batch/src/sampling.cpp
0 → 100644
View file @
43baf787
/*
batch version of point sampling and gathering, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include "sampling_gpu.h"
extern
THCState
*
state
;
int
gather_points_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
){
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
gather_points_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
points
,
idx
,
out
,
stream
);
return
1
;
}
int
gather_points_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
)
{
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
gather_points_grad_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
grad_out
,
idx
,
grad_points
,
stream
);
return
1
;
}
int
furthest_point_sampling_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
points_tensor
,
at
::
Tensor
temp_tensor
,
at
::
Tensor
idx_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
float
*
temp
=
temp_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
furthest_point_sampling_kernel_launcher
(
b
,
n
,
m
,
points
,
temp
,
idx
,
stream
);
return
1
;
}
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.cu
0 → 100644
View file @
43baf787
/*
batch version of point sampling and gathering, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "sampling_gpu.h"
__global__ void gather_points_kernel_fast(int b, int c, int n, int m,
    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
    // points: (B, C, N)
    // idx: (B, M)
    // output:
    //      out: (B, C, M)
    // Pure gather: out[b, c, p] = points[b, c, idx[b, p]].
    const int batch = blockIdx.z;
    const int chan = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || chan >= c || point >= m) return;

    const int src = idx[batch * m + point];
    out[(batch * c + chan) * m + point] = points[(batch * c + chan) * n + src];
}
void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
    const float *points, const int *idx, float *out, cudaStream_t stream) {
    // points: (B, C, N); idx: (B, npoints)
    // output: out: (B, C, npoints)
    // Grid: x covers the npoints gathered points, y = channels, z = batch.
    dim3 threads(THREADS_PER_BLOCK);
    dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);

    gather_points_kernel_fast<<<blocks, threads, 0, stream>>>(
        b, c, n, npoints, points, idx, out);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
__global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m,
    const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
    // grad_out: (B, C, M)
    // idx: (B, M)
    // output:
    //      grad_points: (B, C, N)
    // Scatter: grad_points[b, c, idx[b, p]] += grad_out[b, c, p].
    const int batch = blockIdx.z;
    const int chan = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || chan >= c || point >= m) return;

    const int src = idx[batch * m + point];
    // Indices may repeat across the M samples, so accumulate atomically.
    atomicAdd(grad_points + (batch * c + chan) * n + src,
              grad_out[(batch * c + chan) * m + point]);
}
void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
    // grad_out: (B, C, npoints); idx: (B, npoints)
    // output: grad_points: (B, C, N)
    // Grid: x covers the npoints gathered points, y = channels, z = batch.
    dim3 threads(THREADS_PER_BLOCK);
    dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);

    gather_points_grad_kernel_fast<<<blocks, threads, 0, stream>>>(
        b, c, n, npoints, grad_out, idx, grad_points);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
// One step of the shared-memory argmax reduction: fold slot idx2 into slot
// idx1, keeping the larger distance and the index of the point that owns it.
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){
    const float v1 = dists[idx1], v2 = dists[idx2];
    const int i1 = dists_i[idx1], i2 = dists_i[idx2];
    dists[idx1] = max(v1, v2);
    // Ties keep i1, matching max(v1, v2) returning v1 when equal.
    dists_i[idx1] = v2 > v1 ? i2 : i1;
}
template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(int b, int n, int m,
    const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
    // dataset: (B, N, 3)
    // tmp: (B, N)  running min squared distance from each point to the selected set
    // output:
    //      idx: (B, M)
    // One block per batch element. All block_size threads cooperate: in each of
    // the m-1 iterations they find the point farthest from the selected set
    // (max over per-point min distances) via a shared-memory reduction.
    if (m <= 0) return;
    __shared__ float dists[block_size];
    __shared__ int dists_i[block_size];

    int batch_index = blockIdx.x;
    // Advance pointers to this batch element's slice.
    dataset += batch_index * n * 3;
    temp += batch_index * n;
    idxs += batch_index * m;

    int tid = threadIdx.x;
    const int stride = block_size;

    int old = 0;  // index of the most recently selected point; seeded with point 0
    if (threadIdx.x == 0)
    idxs[0] = old;

    __syncthreads();
    for (int j = 1; j < m; j++) {
        int besti = 0;
        float best = -1;
        float x1 = dataset[old * 3 + 0];
        float y1 = dataset[old * 3 + 1];
        float z1 = dataset[old * 3 + 2];
        // Each thread scans a strided slice of the n points: update temp[k]
        // (min distance to the selected set, folding in the newest point `old`)
        // and track the locally farthest point.
        for (int k = tid; k < n; k += stride) {
            float x2, y2, z2;
            x2 = dataset[k * 3 + 0];
            y2 = dataset[k * 3 + 1];
            z2 = dataset[k * 3 + 2];
            // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
            // if (mag <= 1e-3)
            // continue;

            float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
            float d2 = min(d, temp[k]);
            temp[k] = d2;
            besti = d2 > best ? k : besti;
            best = d2 > best ? d2 : best;
        }
        dists[tid] = best;
        dists_i[tid] = besti;
        __syncthreads();

        // Tree reduction over the per-thread candidates. block_size is a
        // compile-time constant, so untaken branches are eliminated; each
        // step halves the active range and is followed by a full-block barrier.
        if (block_size >= 1024) {
            if (tid < 512) {
                __update(dists, dists_i, tid, tid + 512);
            }
            __syncthreads();
        }

        if (block_size >= 512) {
            if (tid < 256) {
                __update(dists, dists_i, tid, tid + 256);
            }
            __syncthreads();
        }
        if (block_size >= 256) {
            if (tid < 128) {
                __update(dists, dists_i, tid, tid + 128);
            }
            __syncthreads();
        }
        if (block_size >= 128) {
            if (tid < 64) {
                __update(dists, dists_i, tid, tid + 64);
            }
            __syncthreads();
        }
        if (block_size >= 64) {
            if (tid < 32) {
                __update(dists, dists_i, tid, tid + 32);
            }
            __syncthreads();
        }
        if (block_size >= 32) {
            if (tid < 16) {
                __update(dists, dists_i, tid, tid + 16);
            }
            __syncthreads();
        }
        if (block_size >= 16) {
            if (tid < 8) {
                __update(dists, dists_i, tid, tid + 8);
            }
            __syncthreads();
        }
        if (block_size >= 8) {
            if (tid < 4) {
                __update(dists, dists_i, tid, tid + 4);
            }
            __syncthreads();
        }
        if (block_size >= 4) {
            if (tid < 2) {
                __update(dists, dists_i, tid, tid + 2);
            }
            __syncthreads();
        }
        if (block_size >= 2) {
            if (tid < 1) {
                __update(dists, dists_i, tid, tid + 1);
            }
            __syncthreads();
        }
        // After the final barrier every thread reads the winning index.
        old = dists_i[0];
        if (tid == 0)
        idxs[j] = old;
    }
}
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
    const float *dataset, float *temp, int *idxs, cudaStream_t stream) {
    // dataset: (B, N, 3)
    // tmp: (B, N)
    // output:
    //      idx: (B, M)
    // One block per batch element. The block size comes from opt_n_threads(n)
    // (project helper in cuda_utils.h); the switch instantiates the matching
    // compile-time template so the kernel's reduction ladder fits blockDim.
    cudaError_t err;
    unsigned int n_threads = opt_n_threads(n);

    switch (n_threads) {
        case 1024:
        furthest_point_sampling_kernel<1024><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 512:
        furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 256:
        furthest_point_sampling_kernel<256><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 128:
        furthest_point_sampling_kernel<128><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 64:
        furthest_point_sampling_kernel<64><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 32:
        furthest_point_sampling_kernel<32><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 16:
        furthest_point_sampling_kernel<16><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 8:
        furthest_point_sampling_kernel<8><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 4:
        furthest_point_sampling_kernel<4><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 2:
        furthest_point_sampling_kernel<2><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        case 1:
        furthest_point_sampling_kernel<1><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
        // Fallback for unexpected thread counts.
        default:
        furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
    }

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.h
0 → 100644
View file @
43baf787
#ifndef _SAMPLING_GPU_H
#define _SAMPLING_GPU_H

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include<vector>

// Gather features points (B, C, N) at idx (B, npoints) -> out (B, C, npoints).
int gather_points_wrapper_fast(int b, int c, int n, int npoints,
    at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);

// Kernel launcher for gather_points; runs on the given stream.
void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
    const float *points, const int *idx, float *out, cudaStream_t stream);

// Backward of gather_points: scatter grad_out (B, C, npoints) into grad_points (B, C, N).
int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
    at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);

// Kernel launcher for the gather_points backward pass; runs on the given stream.
void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);

// Furthest point sampling: select m of n points per batch; temp_tensor is (B, N) scratch.
int furthest_point_sampling_wrapper(int b, int n, int m,
    at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);

// Kernel launcher for furthest point sampling; runs on the given stream.
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
    const float *dataset, float *temp, int *idxs, cudaStream_t stream);

#endif
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_modules.py
View file @
43baf787
...
...
@@ -89,3 +89,48 @@ class StackSAModuleMSG(nn.Module):
return
new_xyz
,
new_features
class StackPointnetFPModule(nn.Module):
    # Feature-propagation module for stacked (batch-concatenated) point sets:
    # interpolates features from `known` points onto `unknown` points via
    # inverse-distance-weighted 3-NN, then applies a shared MLP.
    def __init__(self, *, mlp: List[int]):
        """
        Args:
            mlp: list of int, channel sizes of the shared MLP
                (mlp[0] is the input channel count, mlp[-1] the output)
        """
        super().__init__()
        shared_mlps = []
        for k in range(len(mlp) - 1):
            # 1x1 Conv2d + BN + ReLU acts as a per-point linear layer.
            shared_mlps.extend([
                nn.Conv2d(mlp[k], mlp[k + 1], kernel_size=1, bias=False),
                nn.BatchNorm2d(mlp[k + 1]),
                nn.ReLU()
            ])
        self.mlp = nn.Sequential(*shared_mlps)

    def forward(self, unknown, unknown_batch_cnt, known, known_batch_cnt, unknown_feats=None, known_feats=None):
        """
        Args:
            unknown: (N1 + N2 ..., 3) target point coordinates
            unknown_batch_cnt: (batch_size), [N1, N2, ...]
            known: (M1 + M2 ..., 3) source point coordinates
            known_batch_cnt: (batch_size), [M1, M2, ...]
            unknown_feats: (N1 + N2 ..., C1) optional features already at the targets
            known_feats: (M1 + M2 ..., C2) features to be interpolated

        Returns:
            new_features: (N1 + N2 ..., C_out)
        """
        dist, idx = pointnet2_utils.three_nn(unknown, unknown_batch_cnt, known, known_batch_cnt)
        # Inverse-distance weights, normalized over the 3 neighbors; the
        # epsilon guards against division by zero for coincident points.
        dist_recip = 1.0 / (dist + 1e-8)
        norm = torch.sum(dist_recip, dim=-1, keepdim=True)
        weight = dist_recip / norm

        interpolated_feats = pointnet2_utils.three_interpolate(known_feats, idx, weight)

        if unknown_feats is not None:
            new_features = torch.cat([interpolated_feats, unknown_feats], dim=1)  # (N1 + N2 ..., C2 + C1)
        else:
            new_features = interpolated_feats

        # Reshape to (1, C, N1 + N2 ..., 1) so the 1x1 Conv2d MLP can be applied.
        new_features = new_features.permute(1, 0)[None, :, :, None]  # (1, C, N1 + N2 ..., 1)
        new_features = self.mlp(new_features)

        new_features = new_features.squeeze(dim=0).squeeze(dim=-1).permute(1, 0)  # (N1 + N2 ..., C)
        return new_features
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_utils.py
View file @
43baf787
...
...
@@ -185,5 +185,83 @@ class FurthestPointSampling(Function):
furthest_point_sample
=
FurthestPointSampling
.
apply
class ThreeNN(Function):
    # Autograd wrapper around the stacked-batch 3-nearest-neighbor CUDA op.
    @staticmethod
    def forward(ctx, unknown, unknown_batch_cnt, known, known_batch_cnt):
        """
        Args:
            ctx:
            unknown: (N1 + N2..., 3)
            unknown_batch_cnt: (batch_size), [N1, N2, ...]
            known: (M1 + M2..., 3)
            known_batch_cnt: (batch_size), [M1, M2, ...]

        Returns:
            dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
            idx: (N1 + N2 ..., 3) index of the three nearest neighbors, range [0, M1+M2+...]
        """
        assert unknown.shape.__len__() == 2 and unknown.shape[1] == 3
        assert known.shape.__len__() == 2 and known.shape[1] == 3
        assert unknown_batch_cnt.__len__() == known_batch_cnt.__len__()

        dist2 = unknown.new_zeros(unknown.shape)
        idx = unknown_batch_cnt.new_zeros(unknown.shape).int()

        pointnet2.three_nn_wrapper(
            unknown.contiguous(), unknown_batch_cnt.contiguous(),
            known.contiguous(), known_batch_cnt.contiguous(), dist2, idx
        )
        # The CUDA op fills squared distances; return the euclidean distance.
        return torch.sqrt(dist2), idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        # Nearest-neighbor search is not differentiable.
        return None, None


three_nn = ThreeNN.apply
class ThreeInterpolate(Function):
    """Inverse-distance-weighted feature interpolation over the three neighbors
    produced by ThreeNN (stacked-batch layout)."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor):
        """
        Args:
            ctx:
            features: (M1 + M2 ..., C)
            idx: [N1 + N2 ..., 3] neighbor indices into the stacked features
            weight: [N1 + N2 ..., 3] interpolation weights per neighbor

        Returns:
            out_tensor: (N1 + N2 ..., C)
        """
        assert idx.shape[0] == weight.shape[0] and idx.shape[1] == weight.shape[1] == 3

        # Stash what backward needs; features.shape[0] (= M) sizes the gradient
        # buffer. (Stored directly on ctx rather than via save_for_backward.)
        ctx.three_interpolate_for_backward = (idx, weight, features.shape[0])
        output = features.new_zeros((idx.shape[0], features.shape[1]))
        pointnet2.three_interpolate_wrapper(
            features.contiguous(), idx.contiguous(), weight.contiguous(), output
        )
        return output

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor):
        """
        Args:
            ctx:
            grad_out: (N1 + N2 ..., C)

        Returns:
            grad_features: (M1 + M2 ..., C); None for idx and weight (non-differentiable)
        """
        idx, weight, M = ctx.three_interpolate_for_backward
        grad_features = grad_out.new_zeros((M, grad_out.shape[1]))
        pointnet2.three_interpolate_grad_wrapper(
            grad_out.contiguous(), idx.contiguous(), weight.contiguous(), grad_features
        )
        return grad_features, None, None


three_interpolate = ThreeInterpolate.apply
if __name__ == '__main__':
    # No standalone demo or tests for this module.
    pass
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate.cpp
0 → 100644
View file @
43baf787
/*
Stacked-batch-data version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "interpolate_gpu.h"
extern
THCState
*
state
;
// Python-facing wrapper: unpacks tensor metadata/pointers and dispatches the
// stacked-batch 3-NN CUDA launcher.
// unknown: (N1 + N2 ..., 3); unknown_batch_cnt: (batch_size), [N1, N2, ...]
// known:   (M1 + M2 ..., 3); known_batch_cnt:   (batch_size), [M1, M2, ...]
// Outputs written in place: dist2 (N1 + N2 ..., 3) squared distances,
// idx (N1 + N2 ..., 3) neighbor indices into the stacked known array.
void three_nn_wrapper_stack(at::Tensor unknown_tensor,
    at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
    at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor){
    const int batch_size = unknown_batch_cnt_tensor.size(0);
    const int N = unknown_tensor.size(0);
    const int M = known_tensor.size(0);

    three_nn_kernel_launcher_stack(
        batch_size, N, M,
        unknown_tensor.data<float>(), unknown_batch_cnt_tensor.data<int>(),
        known_tensor.data<float>(), known_batch_cnt_tensor.data<int>(),
        dist2_tensor.data<float>(), idx_tensor.data<int>()
    );
}
// Python-facing wrapper for the forward interpolation kernel.
// features: (M1 + M2 ..., C); idx/weight: (N1 + N2 ..., 3)
// out (written in place): (N1 + N2 ..., C)
void three_interpolate_wrapper_stack(at::Tensor features_tensor,
    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor)
{
    const int N = out_tensor.size(0);
    const int channels = features_tensor.size(1);

    three_interpolate_kernel_launcher_stack(
        N, channels,
        features_tensor.data<float>(), idx_tensor.data<int>(),
        weight_tensor.data<float>(), out_tensor.data<float>()
    );
}
// Python-facing wrapper for the interpolation backward kernel.
// grad_out: (N1 + N2 ..., C); idx/weight: (N1 + N2 ..., 3)
// grad_features (accumulated in place): (M1 + M2 ..., C)
void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor,
    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_features_tensor)
{
    const int N = grad_out_tensor.size(0);
    const int channels = grad_out_tensor.size(1);

    three_interpolate_grad_kernel_launcher_stack(
        N, channels,
        grad_out_tensor.data<float>(), idx_tensor.data<int>(),
        weight_tensor.data<float>(), grad_features_tensor.data<float>()
    );
}
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.cu
0 → 100644
View file @
43baf787
/*
Stacked-batch-data version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "interpolate_gpu.h"
// One thread per unknown point; finds its three nearest known points within
// the same batch sample.
// unknown: (N1 + N2 ..., 3); unknown_batch_cnt: (batch_size), [N1, N2, ...]
// known:   (M1 + M2 ..., 3); known_batch_cnt:   (batch_size), [M1, M2, ...]
// Outputs:
//   dist2: (N1 + N2 ..., 3) SQUARED distances (the Python wrapper applies sqrt)
//   idx:   (N1 + N2 ..., 3) global indices into the stacked known array
__global__ void three_nn_kernel_stack(int batch_size, int N, int M, const float *unknown,
    const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
    float *dist2, int *idx) {
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= N) return;

    // Determine which batch sample this unknown point belongs to by scanning
    // the per-sample counts (running prefix sum).
    int bs_idx = 0, pt_cnt = unknown_batch_cnt[0];
    for (int k = 1; k < batch_size; k++){
        if (pt_idx < pt_cnt) break;
        pt_cnt += unknown_batch_cnt[k];
        bs_idx = k;
    }

    int cur_num_known_points = known_batch_cnt[bs_idx];

    // Start offset of this sample's known points inside the stacked array.
    int known_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) known_batch_start_idx += known_batch_cnt[k];

    known += known_batch_start_idx * 3;
    unknown += pt_idx * 3;
    dist2 += pt_idx * 3;
    idx += pt_idx * 3;

    float ux = unknown[0];
    float uy = unknown[1];
    float uz = unknown[2];

    // Track the three smallest squared distances seen so far; 1e40 sentinels
    // (double) guarantee the first three real candidates are accepted.
    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
    int besti1 = 0, besti2 = 0, besti3 = 0;
    for (int k = 0; k < cur_num_known_points; ++k) {
        float x = known[k * 3 + 0];
        float y = known[k * 3 + 1];
        float z = known[k * 3 + 2];
        float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
        if (d < best1) {
            // New global best: demote best1 -> best2 -> best3.
            best3 = best2; besti3 = besti2;
            best2 = best1; besti2 = besti1;
            best1 = d; besti1 = k;
        }
        else if (d < best2) {
            best3 = best2; besti3 = besti2;
            best2 = d; besti2 = k;
        }
        else if (d < best3) {
            best3 = d; besti3 = k;
        }
    }
    dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
    // Convert sample-local indices back to stacked (global) indices.
    idx[0] = besti1 + known_batch_start_idx;
    idx[1] = besti2 + known_batch_start_idx;
    idx[2] = besti3 + known_batch_start_idx;
}
// Host launcher for three_nn_kernel_stack.
// Launch configuration: 1-D grid, one thread per unknown point.
// All pointer arguments are device memory. Aborts the process on launch error,
// matching the other launchers in this extension.
void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
    const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
    float *dist2, int *idx) {
    dim3 blocks(DIVUP(N, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    three_nn_kernel_stack<<<blocks, threads>>>(
        batch_size, N, M, unknown, unknown_batch_cnt, known, known_batch_cnt, dist2, idx);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
// Weighted 3-neighbor feature interpolation.
// features: (M1 + M2 ..., C); idx/weight: (N1 + N2 ..., 3); out: (N1 + N2 ..., C)
// Grid: x covers points, y covers channels -- one thread per (point, channel).
__global__ void three_interpolate_kernel_stack(int N, int channels, const float *features,
    const int *idx, const float *weight, float *out) {
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= N || c_idx >= channels) return;

    const float *w = weight + pt_idx * 3;
    const int *nbr = idx + pt_idx * 3;

    // out[pt, c] = sum over the three neighbors of weight * feature.
    out[pt_idx * channels + c_idx] =
        w[0] * features[nbr[0] * channels + c_idx] +
        w[1] * features[nbr[1] * channels + c_idx] +
        w[2] * features[nbr[2] * channels + c_idx];
}
// Host launcher for three_interpolate_kernel_stack.
// Grid: x tiles the N points, y enumerates the C channels.
// All pointer arguments are device memory; aborts on launch error.
void three_interpolate_kernel_launcher_stack(int N, int channels,
    const float *features, const int *idx, const float *weight, float *out) {
    dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
    dim3 threads(THREADS_PER_BLOCK);

    three_interpolate_kernel_stack<<<blocks, threads>>>(N, channels, features, idx, weight, out);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
// Backward of three_interpolate_kernel_stack: scatter-add the output gradient
// back to the three source points of each interpolated point.
// grad_out: (N1 + N2 ..., C); idx/weight: (N1 + N2 ..., 3)
// grad_features (accumulated): (M1 + M2 ..., C)
// Grid: x covers points, y covers channels, mirroring the forward kernel.
__global__ void three_interpolate_grad_kernel_stack(int N, int channels, const float *grad_out,
    const int *idx, const float *weight, float *grad_features) {
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= N || c_idx >= channels) return;

    grad_out += pt_idx * channels + c_idx;
    weight += pt_idx * 3;
    idx += pt_idx * 3;

    // BUG FIX: the destination element of the (M, C) grad_features matrix is
    // (idx[k], c_idx), i.e. byte offset idx[k] * channels + c_idx. The previous
    // code added only idx[k], so every channel's gradient landed in column 0 of
    // the wrong rows. atomicAdd is required because several output points may
    // share the same source neighbor.
    atomicAdd(grad_features + idx[0] * channels + c_idx, grad_out[0] * weight[0]);
    atomicAdd(grad_features + idx[1] * channels + c_idx, grad_out[0] * weight[1]);
    atomicAdd(grad_features + idx[2] * channels + c_idx, grad_out[0] * weight[2]);
}
// Host launcher for three_interpolate_grad_kernel_stack.
// Grid: x tiles the N points, y enumerates the C channels
// (blockIdx.x(col), blockIdx.y(row)).
// grad_features must be zero-initialized by the caller (the kernel only
// accumulates). Aborts on launch error.
void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
    const int *idx, const float *weight, float *grad_features) {
    dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
    dim3 threads(THREADS_PER_BLOCK);

    three_interpolate_grad_kernel_stack<<<blocks, threads>>>(
        N, channels, grad_out, idx, weight, grad_features);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.h
0 → 100644
View file @
43baf787
#ifndef _INTERPOLATE_GPU_H
#define _INTERPOLATE_GPU_H

#include <torch/serialize/tensor.h>
#include<vector>
#include <cuda.h>
#include <cuda_runtime_api.h>

// Stacked-batch 3-NN: for each unknown point, find its three nearest known
// points within the same batch sample.
// unknown: (N1+N2..., 3), known: (M1+M2..., 3), *_batch_cnt: (batch_size).
// Outputs written in place: dist2 (squared distances) and idx, both (N1+N2..., 3).
void three_nn_wrapper_stack(at::Tensor unknown_tensor,
    at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
    at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);

// Inverse-distance-weighted interpolation:
// out[n, c] = sum_k weight[n, k] * features[idx[n, k], c].
void three_interpolate_wrapper_stack(at::Tensor features_tensor,
    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);

// Backward of the above: scatter-adds grad_out into grad_features (M1+M2..., C).
void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor,
    at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_features_tensor);

// Raw CUDA launchers used by the wrappers above; all pointers are device memory.
void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
    const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
    float *dist2, int *idx);

void three_interpolate_kernel_launcher_stack(int N, int channels,
    const float *features, const int *idx, const float *weight, float *out);

void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
    const int *idx, const float *weight, float *grad_features);

#endif
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_stack/src/pointnet2_api.cpp
View file @
43baf787
...
...
@@ -4,6 +4,7 @@
#include "ball_query_gpu.h"
#include "group_points_gpu.h"
#include "sampling_gpu.h"
#include "interpolate_gpu.h"
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
...
...
@@ -13,4 +14,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m
.
def
(
"group_points_wrapper"
,
&
group_points_wrapper_stack
,
"group_points_wrapper_stack"
);
m
.
def
(
"group_points_grad_wrapper"
,
&
group_points_grad_wrapper_stack
,
"group_points_grad_wrapper_stack"
);
m
.
def
(
"three_nn_wrapper"
,
&
three_nn_wrapper_stack
,
"three_nn_wrapper_stack"
);
m
.
def
(
"three_interpolate_wrapper"
,
&
three_interpolate_wrapper_stack
,
"three_interpolate_wrapper_stack"
);
m
.
def
(
"three_interpolate_grad_wrapper"
,
&
three_interpolate_grad_wrapper_stack
,
"three_interpolate_grad_wrapper_stack"
);
}
pcdet/ops/roipoint_pool3d/roipoint_pool3d_utils.py
0 → 100644
View file @
43baf787
import
torch
import
torch.nn
as
nn
from
torch.autograd
import
Function
from
.
import
roipoint_pool3d_cuda
from
...utils
import
box_utils
class RoIPointPool3d(nn.Module):
    """Module wrapper around RoIPointPool3dFunction: pools raw point
    coordinates and per-point features inside (slightly enlarged) 3D boxes."""

    def __init__(self, num_sampled_points=512, pool_extra_width=1.0):
        """
        Args:
            num_sampled_points: number of points kept per box (short boxes are
                padded by duplicating indices in the CUDA op)
            pool_extra_width: margin by which each box is enlarged before pooling
        """
        super().__init__()
        self.num_sampled_points = num_sampled_points
        self.pool_extra_width = pool_extra_width

    def forward(self, points, point_features, boxes3d):
        """
        Args:
            points: (B, N, 3)
            point_features: (B, N, C)
            boxes3d: (B, M, 7), [x, y, z, dx, dy, dz, heading]

        Returns:
            pooled_features: (B, M, 512, 3 + C)
            pooled_empty_flag: (B, M)
        """
        return RoIPointPool3dFunction.apply(
            points, point_features, boxes3d, self.pool_extra_width, self.num_sampled_points
        )
class RoIPointPool3dFunction(Function):
    """autograd.Function bridging to the roipoint_pool3d CUDA extension
    (forward only; backward is intentionally unimplemented)."""

    @staticmethod
    def forward(ctx, points, point_features, boxes3d, pool_extra_width, num_sampled_points=512):
        """
        Args:
            ctx:
            points: (B, N, 3)
            point_features: (B, N, C)
            boxes3d: (B, num_boxes, 7), [x, y, z, dx, dy, dz, heading]
            pool_extra_width: margin by which each box is enlarged before the inside test
            num_sampled_points: points kept per box

        Returns:
            pooled_features: (B, num_boxes, 512, 3 + C)
            pooled_empty_flag: (B, num_boxes), 1 where a box contains no points
        """
        assert points.shape.__len__() == 3 and points.shape[2] == 3
        batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[1], point_features.shape[2]
        # Enlarge the boxes so border points are pooled as well.
        pooled_boxes3d = box_utils.enlarge_box3d(boxes3d.view(-1, 7), pool_extra_width).view(batch_size, -1, 7)

        # Output buffers; the CUDA op fills them in place.
        pooled_features = point_features.new_zeros((batch_size, boxes_num, num_sampled_points, 3 + feature_len))
        pooled_empty_flag = point_features.new_zeros((batch_size, boxes_num)).int()

        roipoint_pool3d_cuda.forward(
            points.contiguous(), pooled_boxes3d.contiguous(),
            point_features.contiguous(), pooled_features, pooled_empty_flag
        )

        return pooled_features, pooled_empty_flag

    @staticmethod
    def backward(ctx, grad_out):
        # Pooling is treated as a non-differentiable sampling operation.
        raise NotImplementedError
if __name__ == '__main__':
    # No standalone demo or tests for this module.
    pass
pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp
0 → 100644
View file @
43baf787
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void
roipool3dLauncher
(
int
batch_size
,
int
pts_num
,
int
boxes_num
,
int
feature_in_len
,
int
sampled_pts_num
,
const
float
*
xyz
,
const
float
*
boxes3d
,
const
float
*
pts_feature
,
float
*
pooled_features
,
int
*
pooled_empty_flag
);
// Python-facing entry point: validates inputs, unpacks sizes/pointers and
// dispatches the pooling pipeline.
// xyz: (B, N, 3); boxes3d: (B, M, 7); pts_feature: (B, N, C)
// Written in place: pooled_features (B, M, 512, 3+C), pooled_empty_flag (B, M).
// Always returns 1 (legacy success convention for this extension).
int roipool3d_gpu(at::Tensor xyz, at::Tensor boxes3d, at::Tensor pts_feature,
        at::Tensor pooled_features, at::Tensor pooled_empty_flag){
    CHECK_INPUT(xyz);
    CHECK_INPUT(boxes3d);
    CHECK_INPUT(pts_feature);
    CHECK_INPUT(pooled_features);
    CHECK_INPUT(pooled_empty_flag);

    const int batch_size = xyz.size(0);
    const int pts_num = xyz.size(1);
    const int boxes_num = boxes3d.size(1);
    const int feature_in_len = pts_feature.size(2);
    const int sampled_pts_num = pooled_features.size(2);

    roipool3dLauncher(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
                      xyz.data<float>(), boxes3d.data<float>(), pts_feature.data<float>(),
                      pooled_features.data<float>(), pooled_empty_flag.data<int>());

    return 1;
}
// Expose the pooling op to Python as `roipoint_pool3d_cuda.forward`.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &roipool3d_gpu, "roipool3d forward (CUDA)");
}
pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu
0 → 100644
View file @
43baf787
/*
Point cloud feature pooling
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <math.h>
#include <stdio.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// #define DEBUG
// Rotates the (already box-centered) offset (shift_x, shift_y) by -rot_angle
// around z, writing the box-frame coordinates to local_x/local_y.
// Uses the single-precision intrinsics cosf/sinf: plain cos()/sin() on a float
// argument promote the math to double, which is much slower on GPUs with no
// accuracy benefit for this geometry test.
__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rot_angle,
                                             float &local_x, float &local_y){
    float cosa = cosf(-rot_angle), sina = sinf(-rot_angle);
    local_x = shift_x * cosa + shift_y * (-sina);
    local_y = shift_x * sina + shift_y * cosa;
}
// Tests whether a point lies inside a heading-rotated 3D box.
// pt: (x, y, z); box3d: [x, y, z, dx, dy, dz, heading], (x, y, z) is the box center.
// Returns 1 when pt is inside the box (enlarged by MARGIN in the xy plane),
// else 0; also writes the point's box-frame planar coords to local_x/local_y.
// Single-precision throughout: fabsf and the 0.5f/1e-5f literals avoid the
// implicit float->double promotion that fabs()/2.0 caused in device code,
// and the inside flag is an int rather than a float round-trip.
__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d,
                                        float &local_x, float &local_y){
    const float MARGIN = 1e-5f;
    float x = pt[0], y = pt[1], z = pt[2];
    float cx = box3d[0], cy = box3d[1], cz = box3d[2];
    float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];

    // Cheap vertical rejection before the rotation.
    if (fabsf(z - cz) > dz * 0.5f) return 0;

    lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
    int in_flag = (fabsf(local_x) < dx * 0.5f + MARGIN) & (fabsf(local_y) < dy * 0.5f + MARGIN);
    return in_flag;
}
// Per-(point, box) inside test.
// xyz: (B, N, 3); boxes3d: (B, M, 7)
// pts_assign: (B, N, M) -- 1 when point n of sample b lies inside box m, else 0
// Grid: x tiles the points, y enumerates the boxes, z enumerates the batch.
__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
    const float *xyz, const float *boxes3d, int *pts_assign){
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int box_idx = blockIdx.y;
    int bs_idx = blockIdx.z;

    if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
        return;
    }

    int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
    // The buffer comes straight from cudaMalloc (uninitialized), so clear the
    // slot first; it is overwritten with the real flag below.
    pts_assign[assign_idx] = 0;

    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;

    float local_x = 0, local_y = 0;
    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y);
    pts_assign[assign_idx] = cur_in_flag;
    // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]);
}
// Collects, per box, up to sampled_pts_num indices of points assigned to it.
// pts_assign: (B, N, M) inside flags from assign_pts_to_box3d
// pts_idx: (B, M, sampled_pts_num) output indices (padded by duplication)
// pooled_empty_flag: (B, M), set to 1 for boxes with no points; assumed
//   zero-initialized by the caller (it is: new_zeros on the Python side)
// Grid: x tiles the boxes, y enumerates the batch; one thread per box,
// sequentially scanning all N points.
__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num,
    const int *pts_assign, int *pts_idx, int *pooled_empty_flag){
    int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (boxes_idx >= boxes_num){
        return;
    }

    int bs_idx = blockIdx.y;

    // Gather the first sampled_pts_num assigned points.
    int cnt = 0;
    for (int k = 0; k < pts_num; k++){
        if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){
            if (cnt < sampled_pts_num){
                pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k;
                cnt++;
            }
            else break;
        }
    }

    if (cnt == 0){
        // Empty box: pts_idx entries stay uninitialized, but roipool3d_forward
        // skips boxes flagged here.
        pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
    }
    else if (cnt < sampled_pts_num){
        // duplicate same points for sampling
        for (int k = cnt; k < sampled_pts_num; k++){
            int duplicate_idx = k % cnt;
            int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
            pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
        }
    }
}
// Copies xyz + features of the selected points into the pooled output.
// xyz: (B, N, 3); pts_idx: (B, M, sampled_pts_num); pts_feature: (B, N, C)
// pooled_features: (B, M, sampled_pts_num, 3+C); pooled_empty_flag: (B, M)
// Grid: x tiles the sampled points, y enumerates boxes, z enumerates the batch.
__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num,
    int feature_in_len, int sampled_pts_num,
    const float *xyz, const int *pts_idx, const float *pts_feature,
    float *pooled_features, int *pooled_empty_flag){
    int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int box_idx = blockIdx.y;
    int bs_idx = blockIdx.z;

    if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){
        return;
    }

    // Empty boxes were flagged by get_pooled_idx and carry no valid indices.
    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){
        return;
    }

    int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx;
    int src_pt_idx = pts_idx[temp_idx];
    int dst_feature_offset = temp_idx * (3 + feature_in_len);

    // First 3 channels: raw xyz of the source point.
    for (int j = 0; j < 3; j++)
        pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];

    // Remaining channels: the point's feature vector.
    int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
    for (int j = 0; j < feature_in_len; j++)
        pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j];
}
// Host-side pipeline for RoI point pooling:
//  1) assign_pts_to_box3d: per-(point, box) inside test -> pts_assign (B, N, M)
//  2) get_pooled_idx: gather/duplicate up to sampled_pts_num indices per box
//  3) roipool3d_forward: copy xyz + features for the selected indices
// All pointer parameters are device memory. Errors abort the process with a
// message, matching the launcher style used elsewhere in this project.
void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len,
    int sampled_pts_num, const float *xyz, const float *boxes3d,
    const float *pts_feature, float *pooled_features, int *pooled_empty_flag){
    // printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num, boxes_num);
    cudaError_t err;

    // (batch_size, N, M) -- size computed in size_t to avoid int overflow on
    // large point clouds / many boxes.
    int *pts_assign = NULL;
    err = cudaMalloc(&pts_assign, (size_t)batch_size * pts_num * boxes_num * sizeof(int));
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA malloc failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
    // cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int));

    dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);
    assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign);
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    // (batch_size, M, sampled_pts_num)
    int *pts_idx = NULL;
    err = cudaMalloc(&pts_idx, (size_t)batch_size * boxes_num * sampled_pts_num * sizeof(int));
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA malloc failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size);  // blockIdx.x(col), blockIdx.y(row)
    get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num, sampled_pts_num,
        pts_assign, pts_idx, pooled_empty_flag);
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
    roipool3d_forward<<<blocks_pool, threads>>>(batch_size, pts_num, boxes_num,
        feature_in_len, sampled_pts_num, xyz, pts_idx, pts_feature,
        pooled_features, pooled_empty_flag);
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    cudaFree(pts_assign);
    cudaFree(pts_idx);

#ifdef DEBUG
    cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}
\ No newline at end of file
pcdet/utils/box_coder_utils.py
View file @
43baf787
import
torch
import
numpy
as
np
class
ResidualCoder
(
object
):
...
...
@@ -123,3 +124,84 @@ class PreviousResidualRoIDecoder(object):
cgs
=
[
t
+
a
for
t
,
a
in
zip
(
cts
,
cas
)]
return
torch
.
cat
([
xg
,
yg
,
zg
,
dxg
,
dyg
,
dzg
,
rg
,
*
cgs
],
dim
=-
1
)
class PointResidualCoder(object):
    """Point-based box coder: encodes a 3D box as residuals relative to a point
    (optionally normalized by a per-class mean box size), with the heading
    stored as a (cos, sin) pair."""

    def __init__(self, code_size=8, use_mean_size=True, **kwargs):
        """
        Args:
            code_size: length of the base encoding (8 = x, y, z, dx, dy, dz, cos, sin)
            use_mean_size: when True, center/size residuals are normalized by the
                per-class mean sizes given in kwargs['mean_size'] (moved to GPU as float)
        """
        super().__init__()
        self.code_size = code_size
        self.use_mean_size = use_mean_size
        if self.use_mean_size:
            self.mean_size = torch.from_numpy(np.array(kwargs['mean_size'])).cuda().float()
            assert self.mean_size.min() > 0

    def encode_torch(self, gt_boxes, points, gt_classes=None):
        """
        Args:
            gt_boxes: (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
            points: (N, 3) [x, y, z]
            gt_classes: (N) [1, num_classes], required when use_mean_size is True

        Returns:
            box_coding: (N, 8 + C)
        """
        # Guard against degenerate (non-positive) sizes before the logs below.
        # NOTE: this clamps gt_boxes in place, mutating the caller's tensor.
        gt_boxes[:, 3:6] = torch.clamp_min(gt_boxes[:, 3:6], min=1e-5)

        xg, yg, zg, dxg, dyg, dzg, rg, *extra_gt = torch.split(gt_boxes, 1, dim=-1)
        xa, ya, za = torch.split(points, 1, dim=-1)

        if self.use_mean_size:
            assert gt_classes.max() <= self.mean_size.shape[0]
            # Per-point "anchor" size looked up from the class (classes are 1-based).
            dxa, dya, dza = torch.split(self.mean_size[gt_classes - 1], 1, dim=-1)
            diagonal = torch.sqrt(dxa ** 2 + dya ** 2)
            xt, yt = (xg - xa) / diagonal, (yg - ya) / diagonal
            zt = (zg - za) / dza
            dxt, dyt, dzt = torch.log(dxg / dxa), torch.log(dyg / dya), torch.log(dzg / dza)
        else:
            # Raw offsets / log sizes without anchor normalization.
            xt, yt, zt = (xg - xa), (yg - ya), (zg - za)
            dxt, dyt, dzt = torch.log(dxg), torch.log(dyg), torch.log(dzg)

        # Heading is encoded as (cos, sin); extra attributes pass through.
        return torch.cat([xt, yt, zt, dxt, dyt, dzt, torch.cos(rg), torch.sin(rg), *extra_gt], dim=-1)

    def decode_torch(self, box_encodings, points, pred_classes=None):
        """
        Args:
            box_encodings: (N, 8 + C) [x, y, z, dx, dy, dz, cos, sin, ...]
            points: [x, y, z]
            pred_classes: (N) [1, num_classes], required when use_mean_size is True

        Returns:
            decoded boxes: (N, 7 + C) [x, y, z, dx, dy, dz, heading, ...]
        """
        xt, yt, zt, dxt, dyt, dzt, cost, sint, *extra_t = torch.split(box_encodings, 1, dim=-1)
        xa, ya, za = torch.split(points, 1, dim=-1)

        if self.use_mean_size:
            assert pred_classes.max() <= self.mean_size.shape[0]
            dxa, dya, dza = torch.split(self.mean_size[pred_classes - 1], 1, dim=-1)
            diagonal = torch.sqrt(dxa ** 2 + dya ** 2)
            xg, yg = xt * diagonal + xa, yt * diagonal + ya
            zg = zt * dza + za
            dxg, dyg, dzg = torch.exp(dxt) * dxa, torch.exp(dyt) * dya, torch.exp(dzt) * dza
        else:
            xg, yg, zg = xt + xa, yt + ya, zt + za
            dxg, dyg, dzg = torch.split(torch.exp(box_encodings[..., 3:6]), 1, dim=-1)

        # Recover the heading from its (cos, sin) pair.
        rg = torch.atan2(sint, cost)
        return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *extra_t], dim=-1)
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment