Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenPCDet
Commits
32567b04
Unverified
Commit
32567b04
authored
Jul 30, 2020
by
Shaoshuai Shi
Committed by
GitHub
Jul 30, 2020
Browse files
Merge pull request #192 from sshaoshuai/master
Release OpenPCDet v0.3.0
parents
853b759b
04e0d4f0
Changes
114
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1302 additions
and
22 deletions
+1302
-22
pcdet/ops/pointnet2/pointnet2_batch/src/group_points.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/group_points.cpp
+39
-0
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.cu
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.cu
+92
-0
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.h
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.h
+22
-0
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate.cpp
+58
-0
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.cu
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.cu
+168
-0
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.h
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.h
+30
-0
pcdet/ops/pointnet2/pointnet2_batch/src/pointnet2_api.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/pointnet2_api.cpp
+24
-0
pcdet/ops/pointnet2/pointnet2_batch/src/sampling.cpp
pcdet/ops/pointnet2/pointnet2_batch/src/sampling.cpp
+50
-0
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.cu
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.cu
+260
-0
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.h
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.h
+29
-0
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_modules.py
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_modules.py
+47
-1
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_utils.py
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_utils.py
+79
-2
pcdet/ops/pointnet2/pointnet2_stack/src/ball_query.cpp
pcdet/ops/pointnet2/pointnet2_stack/src/ball_query.cpp
+12
-2
pcdet/ops/pointnet2/pointnet2_stack/src/group_points.cpp
pcdet/ops/pointnet2/pointnet2_stack/src/group_points.cpp
+12
-2
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate.cpp
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate.cpp
+110
-0
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.cu
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.cu
+195
-0
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.h
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.h
+39
-0
pcdet/ops/pointnet2/pointnet2_stack/src/pointnet2_api.cpp
pcdet/ops/pointnet2/pointnet2_stack/src/pointnet2_api.cpp
+5
-0
pcdet/ops/pointnet2/pointnet2_stack/src/sampling.cpp
pcdet/ops/pointnet2/pointnet2_stack/src/sampling.cpp
+18
-2
pcdet/ops/pointnet2/pointnet2_stack/src/sampling_gpu.cu
pcdet/ops/pointnet2/pointnet2_stack/src/sampling_gpu.cu
+13
-13
No files found.
pcdet/ops/pointnet2/pointnet2_batch/src/group_points.cpp
0 → 100644
View file @
32567b04
/*
batch version of point grouping, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <THC/THC.h>
#include "group_points_gpu.h"
extern
THCState
*
state
;
int
group_points_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
)
{
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
group_points_grad_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
nsample
,
grad_out
,
idx
,
grad_points
);
return
1
;
}
int
group_points_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
group_points_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
nsample
,
points
,
idx
,
out
);
return
1
;
}
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.cu
0 → 100644
View file @
32567b04
/*
batch version of point grouping, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "group_points_gpu.h"
__global__
void
group_points_grad_kernel_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
__restrict__
grad_out
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
grad_points
)
{
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
pt_idx
=
index
/
nsample
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
npoints
)
return
;
int
sample_idx
=
index
%
nsample
;
grad_out
+=
bs_idx
*
c
*
npoints
*
nsample
+
c_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
idx
+=
bs_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
atomicAdd
(
grad_points
+
bs_idx
*
c
*
n
+
c_idx
*
n
+
idx
[
0
]
,
grad_out
[
0
]);
}
void
group_points_grad_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
)
{
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
*
nsample
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
group_points_grad_kernel_fast
<<<
blocks
,
threads
>>>
(
b
,
c
,
n
,
npoints
,
nsample
,
grad_out
,
idx
,
grad_points
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
group_points_kernel_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
__restrict__
points
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
out
)
{
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
pt_idx
=
index
/
nsample
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
npoints
)
return
;
int
sample_idx
=
index
%
nsample
;
idx
+=
bs_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
int
in_idx
=
bs_idx
*
c
*
n
+
c_idx
*
n
+
idx
[
0
];
int
out_idx
=
bs_idx
*
c
*
npoints
*
nsample
+
c_idx
*
npoints
*
nsample
+
pt_idx
*
nsample
+
sample_idx
;
out
[
out_idx
]
=
points
[
in_idx
];
}
void
group_points_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
)
{
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
*
nsample
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
group_points_kernel_fast
<<<
blocks
,
threads
>>>
(
b
,
c
,
n
,
npoints
,
nsample
,
points
,
idx
,
out
);
// cudaDeviceSynchronize(); // for using printf in kernel function
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
pcdet/ops/pointnet2/pointnet2_batch/src/group_points_gpu.h
0 → 100644
View file @
32567b04
#ifndef _GROUP_POINTS_GPU_H
#define _GROUP_POINTS_GPU_H
#include <torch/serialize/tensor.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>
int
group_points_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
);
void
group_points_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
);
int
group_points_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
);
void
group_points_grad_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
int
nsample
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
);
#endif
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate.cpp
0 → 100644
View file @
32567b04
/*
batch version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "interpolate_gpu.h"
extern
THCState
*
state
;
void
three_nn_wrapper_fast
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
unknown_tensor
,
at
::
Tensor
known_tensor
,
at
::
Tensor
dist2_tensor
,
at
::
Tensor
idx_tensor
)
{
const
float
*
unknown
=
unknown_tensor
.
data
<
float
>
();
const
float
*
known
=
known_tensor
.
data
<
float
>
();
float
*
dist2
=
dist2_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
three_nn_kernel_launcher_fast
(
b
,
n
,
m
,
unknown
,
known
,
dist2
,
idx
);
}
void
three_interpolate_wrapper_fast
(
int
b
,
int
c
,
int
m
,
int
n
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
out_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
three_interpolate_kernel_launcher_fast
(
b
,
c
,
m
,
n
,
points
,
idx
,
weight
,
out
);
}
void
three_interpolate_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
grad_points_tensor
)
{
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
three_interpolate_grad_kernel_launcher_fast
(
b
,
c
,
n
,
m
,
grad_out
,
idx
,
weight
,
grad_points
);
}
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.cu
0 → 100644
View file @
32567b04
/*
batch version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "interpolate_gpu.h"
__global__
void
three_nn_kernel_fast
(
int
b
,
int
n
,
int
m
,
const
float
*
__restrict__
unknown
,
const
float
*
__restrict__
known
,
float
*
__restrict__
dist2
,
int
*
__restrict__
idx
)
{
// unknown: (B, N, 3)
// known: (B, M, 3)
// output:
// dist2: (B, N, 3)
// idx: (B, N, 3)
int
bs_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
pt_idx
>=
n
)
return
;
unknown
+=
bs_idx
*
n
*
3
+
pt_idx
*
3
;
known
+=
bs_idx
*
m
*
3
;
dist2
+=
bs_idx
*
n
*
3
+
pt_idx
*
3
;
idx
+=
bs_idx
*
n
*
3
+
pt_idx
*
3
;
float
ux
=
unknown
[
0
];
float
uy
=
unknown
[
1
];
float
uz
=
unknown
[
2
];
double
best1
=
1e40
,
best2
=
1e40
,
best3
=
1e40
;
int
besti1
=
0
,
besti2
=
0
,
besti3
=
0
;
for
(
int
k
=
0
;
k
<
m
;
++
k
)
{
float
x
=
known
[
k
*
3
+
0
];
float
y
=
known
[
k
*
3
+
1
];
float
z
=
known
[
k
*
3
+
2
];
float
d
=
(
ux
-
x
)
*
(
ux
-
x
)
+
(
uy
-
y
)
*
(
uy
-
y
)
+
(
uz
-
z
)
*
(
uz
-
z
);
if
(
d
<
best1
)
{
best3
=
best2
;
besti3
=
besti2
;
best2
=
best1
;
besti2
=
besti1
;
best1
=
d
;
besti1
=
k
;
}
else
if
(
d
<
best2
)
{
best3
=
best2
;
besti3
=
besti2
;
best2
=
d
;
besti2
=
k
;
}
else
if
(
d
<
best3
)
{
best3
=
d
;
besti3
=
k
;
}
}
dist2
[
0
]
=
best1
;
dist2
[
1
]
=
best2
;
dist2
[
2
]
=
best3
;
idx
[
0
]
=
besti1
;
idx
[
1
]
=
besti2
;
idx
[
2
]
=
besti3
;
}
void
three_nn_kernel_launcher_fast
(
int
b
,
int
n
,
int
m
,
const
float
*
unknown
,
const
float
*
known
,
float
*
dist2
,
int
*
idx
)
{
// unknown: (B, N, 3)
// known: (B, M, 3)
// output:
// dist2: (B, N, 3)
// idx: (B, N, 3)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
n
,
THREADS_PER_BLOCK
),
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
three_nn_kernel_fast
<<<
blocks
,
threads
>>>
(
b
,
n
,
m
,
unknown
,
known
,
dist2
,
idx
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
three_interpolate_kernel_fast
(
int
b
,
int
c
,
int
m
,
int
n
,
const
float
*
__restrict__
points
,
const
int
*
__restrict__
idx
,
const
float
*
__restrict__
weight
,
float
*
__restrict__
out
)
{
// points: (B, C, M)
// idx: (B, N, 3)
// weight: (B, N, 3)
// output:
// out: (B, C, N)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
n
)
return
;
weight
+=
bs_idx
*
n
*
3
+
pt_idx
*
3
;
points
+=
bs_idx
*
c
*
m
+
c_idx
*
m
;
idx
+=
bs_idx
*
n
*
3
+
pt_idx
*
3
;
out
+=
bs_idx
*
c
*
n
+
c_idx
*
n
;
out
[
pt_idx
]
=
weight
[
0
]
*
points
[
idx
[
0
]]
+
weight
[
1
]
*
points
[
idx
[
1
]]
+
weight
[
2
]
*
points
[
idx
[
2
]];
}
void
three_interpolate_kernel_launcher_fast
(
int
b
,
int
c
,
int
m
,
int
n
,
const
float
*
points
,
const
int
*
idx
,
const
float
*
weight
,
float
*
out
)
{
// points: (B, C, M)
// idx: (B, N, 3)
// weight: (B, N, 3)
// output:
// out: (B, C, N)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
n
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
three_interpolate_kernel_fast
<<<
blocks
,
threads
>>>
(
b
,
c
,
m
,
n
,
points
,
idx
,
weight
,
out
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
three_interpolate_grad_kernel_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
__restrict__
grad_out
,
const
int
*
__restrict__
idx
,
const
float
*
__restrict__
weight
,
float
*
__restrict__
grad_points
)
{
// grad_out: (B, C, N)
// weight: (B, N, 3)
// output:
// grad_points: (B, C, M)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
n
)
return
;
grad_out
+=
bs_idx
*
c
*
n
+
c_idx
*
n
+
pt_idx
;
weight
+=
bs_idx
*
n
*
3
+
pt_idx
*
3
;
grad_points
+=
bs_idx
*
c
*
m
+
c_idx
*
m
;
idx
+=
bs_idx
*
n
*
3
+
pt_idx
*
3
;
atomicAdd
(
grad_points
+
idx
[
0
],
grad_out
[
0
]
*
weight
[
0
]);
atomicAdd
(
grad_points
+
idx
[
1
],
grad_out
[
0
]
*
weight
[
1
]);
atomicAdd
(
grad_points
+
idx
[
2
],
grad_out
[
0
]
*
weight
[
2
]);
}
void
three_interpolate_grad_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
grad_out
,
const
int
*
idx
,
const
float
*
weight
,
float
*
grad_points
)
{
// grad_out: (B, C, N)
// weight: (B, N, 3)
// output:
// grad_points: (B, C, M)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
n
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
three_interpolate_grad_kernel_fast
<<<
blocks
,
threads
>>>
(
b
,
c
,
n
,
m
,
grad_out
,
idx
,
weight
,
grad_points
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
pcdet/ops/pointnet2/pointnet2_batch/src/interpolate_gpu.h
0 → 100644
View file @
32567b04
#ifndef _INTERPOLATE_GPU_H
#define _INTERPOLATE_GPU_H
#include <torch/serialize/tensor.h>
#include<vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
void
three_nn_wrapper_fast
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
unknown_tensor
,
at
::
Tensor
known_tensor
,
at
::
Tensor
dist2_tensor
,
at
::
Tensor
idx_tensor
);
void
three_nn_kernel_launcher_fast
(
int
b
,
int
n
,
int
m
,
const
float
*
unknown
,
const
float
*
known
,
float
*
dist2
,
int
*
idx
);
void
three_interpolate_wrapper_fast
(
int
b
,
int
c
,
int
m
,
int
n
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
out_tensor
);
void
three_interpolate_kernel_launcher_fast
(
int
b
,
int
c
,
int
m
,
int
n
,
const
float
*
points
,
const
int
*
idx
,
const
float
*
weight
,
float
*
out
);
void
three_interpolate_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
grad_points_tensor
);
void
three_interpolate_grad_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
grad_out
,
const
int
*
idx
,
const
float
*
weight
,
float
*
grad_points
);
#endif
pcdet/ops/pointnet2/pointnet2_batch/src/pointnet2_api.cpp
0 → 100644
View file @
32567b04
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include "ball_query_gpu.h"
#include "group_points_gpu.h"
#include "sampling_gpu.h"
#include "interpolate_gpu.h"
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
m
.
def
(
"ball_query_wrapper"
,
&
ball_query_wrapper_fast
,
"ball_query_wrapper_fast"
);
m
.
def
(
"group_points_wrapper"
,
&
group_points_wrapper_fast
,
"group_points_wrapper_fast"
);
m
.
def
(
"group_points_grad_wrapper"
,
&
group_points_grad_wrapper_fast
,
"group_points_grad_wrapper_fast"
);
m
.
def
(
"gather_points_wrapper"
,
&
gather_points_wrapper_fast
,
"gather_points_wrapper_fast"
);
m
.
def
(
"gather_points_grad_wrapper"
,
&
gather_points_grad_wrapper_fast
,
"gather_points_grad_wrapper_fast"
);
m
.
def
(
"furthest_point_sampling_wrapper"
,
&
furthest_point_sampling_wrapper
,
"furthest_point_sampling_wrapper"
);
m
.
def
(
"three_nn_wrapper"
,
&
three_nn_wrapper_fast
,
"three_nn_wrapper_fast"
);
m
.
def
(
"three_interpolate_wrapper"
,
&
three_interpolate_wrapper_fast
,
"three_interpolate_wrapper_fast"
);
m
.
def
(
"three_interpolate_grad_wrapper"
,
&
three_interpolate_grad_wrapper_fast
,
"three_interpolate_grad_wrapper_fast"
);
}
pcdet/ops/pointnet2/pointnet2_batch/src/sampling.cpp
0 → 100644
View file @
32567b04
/*
batch version of point sampling and gathering, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include "sampling_gpu.h"
extern
THCState
*
state
;
int
gather_points_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
){
const
float
*
points
=
points_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
gather_points_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
points
,
idx
,
out
);
return
1
;
}
int
gather_points_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
)
{
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
grad_points
=
grad_points_tensor
.
data
<
float
>
();
gather_points_grad_kernel_launcher_fast
(
b
,
c
,
n
,
npoints
,
grad_out
,
idx
,
grad_points
);
return
1
;
}
int
furthest_point_sampling_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
points_tensor
,
at
::
Tensor
temp_tensor
,
at
::
Tensor
idx_tensor
)
{
const
float
*
points
=
points_tensor
.
data
<
float
>
();
float
*
temp
=
temp_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
furthest_point_sampling_kernel_launcher
(
b
,
n
,
m
,
points
,
temp
,
idx
);
return
1
;
}
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.cu
0 → 100644
View file @
32567b04
/*
batch version of point sampling and gathering, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2018.
*/
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "sampling_gpu.h"
__global__
void
gather_points_kernel_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
__restrict__
points
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
out
)
{
// points: (B, C, N)
// idx: (B, M)
// output:
// out: (B, C, M)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
m
)
return
;
out
+=
bs_idx
*
c
*
m
+
c_idx
*
m
+
pt_idx
;
idx
+=
bs_idx
*
m
+
pt_idx
;
points
+=
bs_idx
*
c
*
n
+
c_idx
*
n
;
out
[
0
]
=
points
[
idx
[
0
]];
}
void
gather_points_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
)
{
// points: (B, C, N)
// idx: (B, npoints)
// output:
// out: (B, C, npoints)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
gather_points_kernel_fast
<<<
blocks
,
threads
>>>
(
b
,
c
,
n
,
npoints
,
points
,
idx
,
out
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
gather_points_grad_kernel_fast
(
int
b
,
int
c
,
int
n
,
int
m
,
const
float
*
__restrict__
grad_out
,
const
int
*
__restrict__
idx
,
float
*
__restrict__
grad_points
)
{
// grad_out: (B, C, M)
// idx: (B, M)
// output:
// grad_points: (B, C, N)
int
bs_idx
=
blockIdx
.
z
;
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
bs_idx
>=
b
||
c_idx
>=
c
||
pt_idx
>=
m
)
return
;
grad_out
+=
bs_idx
*
c
*
m
+
c_idx
*
m
+
pt_idx
;
idx
+=
bs_idx
*
m
+
pt_idx
;
grad_points
+=
bs_idx
*
c
*
n
+
c_idx
*
n
;
atomicAdd
(
grad_points
+
idx
[
0
],
grad_out
[
0
]);
}
void
gather_points_grad_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
)
{
// grad_out: (B, C, npoints)
// idx: (B, npoints)
// output:
// grad_points: (B, C, N)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
npoints
,
THREADS_PER_BLOCK
),
c
,
b
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
gather_points_grad_kernel_fast
<<<
blocks
,
threads
>>>
(
b
,
c
,
n
,
npoints
,
grad_out
,
idx
,
grad_points
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__device__
void
__update
(
float
*
__restrict__
dists
,
int
*
__restrict__
dists_i
,
int
idx1
,
int
idx2
){
const
float
v1
=
dists
[
idx1
],
v2
=
dists
[
idx2
];
const
int
i1
=
dists_i
[
idx1
],
i2
=
dists_i
[
idx2
];
dists
[
idx1
]
=
max
(
v1
,
v2
);
dists_i
[
idx1
]
=
v2
>
v1
?
i2
:
i1
;
}
template
<
unsigned
int
block_size
>
__global__
void
furthest_point_sampling_kernel
(
int
b
,
int
n
,
int
m
,
const
float
*
__restrict__
dataset
,
float
*
__restrict__
temp
,
int
*
__restrict__
idxs
)
{
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
if
(
m
<=
0
)
return
;
__shared__
float
dists
[
block_size
];
__shared__
int
dists_i
[
block_size
];
int
batch_index
=
blockIdx
.
x
;
dataset
+=
batch_index
*
n
*
3
;
temp
+=
batch_index
*
n
;
idxs
+=
batch_index
*
m
;
int
tid
=
threadIdx
.
x
;
const
int
stride
=
block_size
;
int
old
=
0
;
if
(
threadIdx
.
x
==
0
)
idxs
[
0
]
=
old
;
__syncthreads
();
for
(
int
j
=
1
;
j
<
m
;
j
++
)
{
int
besti
=
0
;
float
best
=
-
1
;
float
x1
=
dataset
[
old
*
3
+
0
];
float
y1
=
dataset
[
old
*
3
+
1
];
float
z1
=
dataset
[
old
*
3
+
2
];
for
(
int
k
=
tid
;
k
<
n
;
k
+=
stride
)
{
float
x2
,
y2
,
z2
;
x2
=
dataset
[
k
*
3
+
0
];
y2
=
dataset
[
k
*
3
+
1
];
z2
=
dataset
[
k
*
3
+
2
];
// float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
// if (mag <= 1e-3)
// continue;
float
d
=
(
x2
-
x1
)
*
(
x2
-
x1
)
+
(
y2
-
y1
)
*
(
y2
-
y1
)
+
(
z2
-
z1
)
*
(
z2
-
z1
);
float
d2
=
min
(
d
,
temp
[
k
]);
temp
[
k
]
=
d2
;
besti
=
d2
>
best
?
k
:
besti
;
best
=
d2
>
best
?
d2
:
best
;
}
dists
[
tid
]
=
best
;
dists_i
[
tid
]
=
besti
;
__syncthreads
();
if
(
block_size
>=
1024
)
{
if
(
tid
<
512
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
512
);
}
__syncthreads
();
}
if
(
block_size
>=
512
)
{
if
(
tid
<
256
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
256
);
}
__syncthreads
();
}
if
(
block_size
>=
256
)
{
if
(
tid
<
128
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
128
);
}
__syncthreads
();
}
if
(
block_size
>=
128
)
{
if
(
tid
<
64
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
64
);
}
__syncthreads
();
}
if
(
block_size
>=
64
)
{
if
(
tid
<
32
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
32
);
}
__syncthreads
();
}
if
(
block_size
>=
32
)
{
if
(
tid
<
16
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
16
);
}
__syncthreads
();
}
if
(
block_size
>=
16
)
{
if
(
tid
<
8
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
8
);
}
__syncthreads
();
}
if
(
block_size
>=
8
)
{
if
(
tid
<
4
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
4
);
}
__syncthreads
();
}
if
(
block_size
>=
4
)
{
if
(
tid
<
2
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
2
);
}
__syncthreads
();
}
if
(
block_size
>=
2
)
{
if
(
tid
<
1
)
{
__update
(
dists
,
dists_i
,
tid
,
tid
+
1
);
}
__syncthreads
();
}
old
=
dists_i
[
0
];
if
(
tid
==
0
)
idxs
[
j
]
=
old
;
}
}
void
furthest_point_sampling_kernel_launcher
(
int
b
,
int
n
,
int
m
,
const
float
*
dataset
,
float
*
temp
,
int
*
idxs
)
{
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
cudaError_t
err
;
unsigned
int
n_threads
=
opt_n_threads
(
n
);
switch
(
n_threads
)
{
case
1024
:
furthest_point_sampling_kernel
<
1024
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
512
:
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
256
:
furthest_point_sampling_kernel
<
256
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
128
:
furthest_point_sampling_kernel
<
128
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
64
:
furthest_point_sampling_kernel
<
64
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
32
:
furthest_point_sampling_kernel
<
32
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
16
:
furthest_point_sampling_kernel
<
16
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
8
:
furthest_point_sampling_kernel
<
8
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
4
:
furthest_point_sampling_kernel
<
4
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
2
:
furthest_point_sampling_kernel
<
2
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
1
:
furthest_point_sampling_kernel
<
1
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
default:
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
}
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
pcdet/ops/pointnet2/pointnet2_batch/src/sampling_gpu.h
0 → 100644
View file @
32567b04
#ifndef _SAMPLING_GPU_H
#define _SAMPLING_GPU_H
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include<vector>
int
gather_points_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
points_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
out_tensor
);
void
gather_points_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
points
,
const
int
*
idx
,
float
*
out
);
int
gather_points_grad_wrapper_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
grad_points_tensor
);
void
gather_points_grad_kernel_launcher_fast
(
int
b
,
int
c
,
int
n
,
int
npoints
,
const
float
*
grad_out
,
const
int
*
idx
,
float
*
grad_points
);
int
furthest_point_sampling_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
points_tensor
,
at
::
Tensor
temp_tensor
,
at
::
Tensor
idx_tensor
);
void
furthest_point_sampling_kernel_launcher
(
int
b
,
int
n
,
int
m
,
const
float
*
dataset
,
float
*
temp
,
int
*
idxs
);
#endif
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_modules.py
View file @
32567b04
from
typing
import
List
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
.
import
pointnet2_utils
from
typing
import
List
class
StackSAModuleMSG
(
nn
.
Module
):
...
...
@@ -89,3 +91,47 @@ class StackSAModuleMSG(nn.Module):
return
new_xyz
,
new_features
class
StackPointnetFPModule
(
nn
.
Module
):
def
__init__
(
self
,
*
,
mlp
:
List
[
int
]):
"""
Args:
mlp: list of int
"""
super
().
__init__
()
shared_mlps
=
[]
for
k
in
range
(
len
(
mlp
)
-
1
):
shared_mlps
.
extend
([
nn
.
Conv2d
(
mlp
[
k
],
mlp
[
k
+
1
],
kernel_size
=
1
,
bias
=
False
),
nn
.
BatchNorm2d
(
mlp
[
k
+
1
]),
nn
.
ReLU
()
])
self
.
mlp
=
nn
.
Sequential
(
*
shared_mlps
)
def
forward
(
self
,
unknown
,
unknown_batch_cnt
,
known
,
known_batch_cnt
,
unknown_feats
=
None
,
known_feats
=
None
):
"""
Args:
unknown: (N1 + N2 ..., 3)
known: (M1 + M2 ..., 3)
unknow_feats: (N1 + N2 ..., C1)
known_feats: (M1 + M2 ..., C2)
Returns:
new_features: (N1 + N2 ..., C_out)
"""
dist
,
idx
=
pointnet2_utils
.
three_nn
(
unknown
,
unknown_batch_cnt
,
known
,
known_batch_cnt
)
dist_recip
=
1.0
/
(
dist
+
1e-8
)
norm
=
torch
.
sum
(
dist_recip
,
dim
=-
1
,
keepdim
=
True
)
weight
=
dist_recip
/
norm
interpolated_feats
=
pointnet2_utils
.
three_interpolate
(
known_feats
,
idx
,
weight
)
if
unknown_feats
is
not
None
:
new_features
=
torch
.
cat
([
interpolated_feats
,
unknown_feats
],
dim
=
1
)
# (N1 + N2 ..., C2 + C1)
else
:
new_features
=
interpolated_feats
new_features
=
new_features
.
permute
(
1
,
0
)[
None
,
:,
:,
None
]
# (1, C, N1 + N2 ..., 1)
new_features
=
self
.
mlp
(
new_features
)
new_features
=
new_features
.
squeeze
(
dim
=
0
).
squeeze
(
dim
=-
1
).
permute
(
1
,
0
)
# (N1 + N2 ..., C)
return
new_features
pcdet/ops/pointnet2/pointnet2_stack/pointnet2_utils.py
View file @
32567b04
import
torch
from
torch.autograd
import
Variable
from
torch.autograd
import
Function
import
torch.nn
as
nn
from
torch.autograd
import
Function
,
Variable
from
.
import
pointnet2_stack_cuda
as
pointnet2
...
...
@@ -185,5 +184,83 @@ class FurthestPointSampling(Function):
furthest_point_sample
=
FurthestPointSampling
.
apply
class
ThreeNN
(
Function
):
@
staticmethod
def
forward
(
ctx
,
unknown
,
unknown_batch_cnt
,
known
,
known_batch_cnt
):
"""
Args:
ctx:
unknown: (N1 + N2..., 3)
unknown_batch_cnt: (batch_size), [N1, N2, ...]
known: (M1 + M2..., 3)
known_batch_cnt: (batch_size), [M1, M2, ...]
Returns:
dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
idx: (N1 + N2 ..., 3) index of the three nearest neighbors, range [0, M1+M2+...]
"""
assert
unknown
.
shape
.
__len__
()
==
2
and
unknown
.
shape
[
1
]
==
3
assert
known
.
shape
.
__len__
()
==
2
and
known
.
shape
[
1
]
==
3
assert
unknown_batch_cnt
.
__len__
()
==
known_batch_cnt
.
__len__
()
dist2
=
unknown
.
new_zeros
(
unknown
.
shape
)
idx
=
unknown_batch_cnt
.
new_zeros
(
unknown
.
shape
).
int
()
pointnet2
.
three_nn_wrapper
(
unknown
.
contiguous
(),
unknown_batch_cnt
.
contiguous
(),
known
.
contiguous
(),
known_batch_cnt
.
contiguous
(),
dist2
,
idx
)
return
torch
.
sqrt
(
dist2
),
idx
@
staticmethod
def
backward
(
ctx
,
a
=
None
,
b
=
None
):
return
None
,
None
three_nn
=
ThreeNN
.
apply
class
ThreeInterpolate
(
Function
):
@
staticmethod
def
forward
(
ctx
,
features
:
torch
.
Tensor
,
idx
:
torch
.
Tensor
,
weight
:
torch
.
Tensor
):
"""
Args:
ctx:
features: (M1 + M2 ..., C)
idx: [N1 + N2 ..., 3]
weight: [N1 + N2 ..., 3]
Returns:
out_tensor: (N1 + N2 ..., C)
"""
assert
idx
.
shape
[
0
]
==
weight
.
shape
[
0
]
and
idx
.
shape
[
1
]
==
weight
.
shape
[
1
]
==
3
ctx
.
three_interpolate_for_backward
=
(
idx
,
weight
,
features
.
shape
[
0
])
output
=
features
.
new_zeros
((
idx
.
shape
[
0
],
features
.
shape
[
1
]))
pointnet2
.
three_interpolate_wrapper
(
features
.
contiguous
(),
idx
.
contiguous
(),
weight
.
contiguous
(),
output
)
return
output
@
staticmethod
def
backward
(
ctx
,
grad_out
:
torch
.
Tensor
):
"""
Args:
ctx:
grad_out: (N1 + N2 ..., C)
Returns:
grad_features: (M1 + M2 ..., C)
"""
idx
,
weight
,
M
=
ctx
.
three_interpolate_for_backward
grad_features
=
grad_out
.
new_zeros
((
M
,
grad_out
.
shape
[
1
]))
pointnet2
.
three_interpolate_grad_wrapper
(
grad_out
.
contiguous
(),
idx
.
contiguous
(),
weight
.
contiguous
(),
grad_features
)
return
grad_features
,
None
,
None
three_interpolate
=
ThreeInterpolate
.
apply
if
__name__
==
'__main__'
:
pass
pcdet/ops/pointnet2/pointnet2_stack/src/ball_query.cpp
View file @
32567b04
...
...
@@ -14,8 +14,18 @@ All Rights Reserved 2019-2020.
extern
THCState
*
state
;
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_CUDA(x) do { \
if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
int
ball_query_wrapper_stack
(
int
B
,
int
M
,
float
radius
,
int
nsample
,
...
...
pcdet/ops/pointnet2/pointnet2_stack/src/group_points.cpp
View file @
32567b04
...
...
@@ -13,8 +13,18 @@ All Rights Reserved 2019-2020.
#include "group_points_gpu.h"
extern
THCState
*
state
;
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_CUDA(x) do { \
if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
...
...
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate.cpp
0 → 100644
View file @
32567b04
/*
Stacked-batch-data version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "interpolate_gpu.h"
extern
THCState
*
state
;
#define CHECK_CUDA(x) do { \
if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void
three_nn_wrapper_stack
(
at
::
Tensor
unknown_tensor
,
at
::
Tensor
unknown_batch_cnt_tensor
,
at
::
Tensor
known_tensor
,
at
::
Tensor
known_batch_cnt_tensor
,
at
::
Tensor
dist2_tensor
,
at
::
Tensor
idx_tensor
){
// unknown: (N1 + N2 ..., 3)
// unknown_batch_cnt: (batch_size), [N1, N2, ...]
// known: (M1 + M2 ..., 3)
// known_batch_cnt: (batch_size), [M1, M2, ...]
// Return:
// dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
// idx: (N1 + N2 ..., 3) index of the three nearest neighbors
CHECK_INPUT
(
unknown_tensor
);
CHECK_INPUT
(
unknown_batch_cnt_tensor
);
CHECK_INPUT
(
known_tensor
);
CHECK_INPUT
(
known_batch_cnt_tensor
);
CHECK_INPUT
(
dist2_tensor
);
CHECK_INPUT
(
idx_tensor
);
int
batch_size
=
unknown_batch_cnt_tensor
.
size
(
0
);
int
N
=
unknown_tensor
.
size
(
0
);
int
M
=
known_tensor
.
size
(
0
);
const
float
*
unknown
=
unknown_tensor
.
data
<
float
>
();
const
int
*
unknown_batch_cnt
=
unknown_batch_cnt_tensor
.
data
<
int
>
();
const
float
*
known
=
known_tensor
.
data
<
float
>
();
const
int
*
known_batch_cnt
=
known_batch_cnt_tensor
.
data
<
int
>
();
float
*
dist2
=
dist2_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
three_nn_kernel_launcher_stack
(
batch_size
,
N
,
M
,
unknown
,
unknown_batch_cnt
,
known
,
known_batch_cnt
,
dist2
,
idx
);
}
void
three_interpolate_wrapper_stack
(
at
::
Tensor
features_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
out_tensor
)
{
// features_tensor: (M1 + M2 ..., C)
// idx_tensor: [N1 + N2 ..., 3]
// weight_tensor: [N1 + N2 ..., 3]
// Return:
// out_tensor: (N1 + N2 ..., C)
CHECK_INPUT
(
features_tensor
);
CHECK_INPUT
(
idx_tensor
);
CHECK_INPUT
(
weight_tensor
);
CHECK_INPUT
(
out_tensor
);
int
N
=
out_tensor
.
size
(
0
);
int
channels
=
features_tensor
.
size
(
1
);
const
float
*
features
=
features_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
out
=
out_tensor
.
data
<
float
>
();
three_interpolate_kernel_launcher_stack
(
N
,
channels
,
features
,
idx
,
weight
,
out
);
}
void
three_interpolate_grad_wrapper_stack
(
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
grad_features_tensor
)
{
// grad_out_tensor: (N1 + N2 ..., C)
// idx_tensor: [N1 + N2 ..., 3]
// weight_tensor: [N1 + N2 ..., 3]
// Return:
// grad_features_tensor: (M1 + M2 ..., C)
CHECK_INPUT
(
grad_out_tensor
);
CHECK_INPUT
(
idx_tensor
);
CHECK_INPUT
(
weight_tensor
);
CHECK_INPUT
(
grad_features_tensor
);
int
N
=
grad_out_tensor
.
size
(
0
);
int
channels
=
grad_out_tensor
.
size
(
1
);
const
float
*
grad_out
=
grad_out_tensor
.
data
<
float
>
();
const
float
*
weight
=
weight_tensor
.
data
<
float
>
();
const
int
*
idx
=
idx_tensor
.
data
<
int
>
();
float
*
grad_features
=
grad_features_tensor
.
data
<
float
>
();
// printf("N=%d, channels=%d\n", N, channels);
three_interpolate_grad_kernel_launcher_stack
(
N
,
channels
,
grad_out
,
idx
,
weight
,
grad_features
);
}
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.cu
0 → 100644
View file @
32567b04
/*
Stacked-batch-data version of point interpolation, modified from the original implementation of official PointNet++ codes.
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "interpolate_gpu.h"
__global__
void
three_nn_kernel_stack
(
int
batch_size
,
int
N
,
int
M
,
const
float
*
unknown
,
const
int
*
unknown_batch_cnt
,
const
float
*
known
,
const
int
*
known_batch_cnt
,
float
*
dist2
,
int
*
idx
)
{
// unknown: (N1 + N2 ..., 3)
// unknown_batch_cnt: (batch_size), [N1, N2, ...]
// known: (M1 + M2 ..., 3)
// known_batch_cnt: (batch_size), [M1, M2, ...]
// Return:
// dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
// idx: (N1 + N2 ..., 3) index of the three nearest neighbors
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
pt_idx
>=
N
)
return
;
int
bs_idx
=
0
,
pt_cnt
=
unknown_batch_cnt
[
0
];
for
(
int
k
=
1
;
k
<
batch_size
;
k
++
){
if
(
pt_idx
<
pt_cnt
)
break
;
pt_cnt
+=
unknown_batch_cnt
[
k
];
bs_idx
=
k
;
}
int
cur_num_known_points
=
known_batch_cnt
[
bs_idx
];
int
known_batch_start_idx
=
0
;
for
(
int
k
=
0
;
k
<
bs_idx
;
k
++
)
known_batch_start_idx
+=
known_batch_cnt
[
k
];
known
+=
known_batch_start_idx
*
3
;
unknown
+=
pt_idx
*
3
;
dist2
+=
pt_idx
*
3
;
idx
+=
pt_idx
*
3
;
float
ux
=
unknown
[
0
];
float
uy
=
unknown
[
1
];
float
uz
=
unknown
[
2
];
double
best1
=
1e40
,
best2
=
1e40
,
best3
=
1e40
;
int
besti1
=
0
,
besti2
=
0
,
besti3
=
0
;
for
(
int
k
=
0
;
k
<
cur_num_known_points
;
++
k
)
{
float
x
=
known
[
k
*
3
+
0
];
float
y
=
known
[
k
*
3
+
1
];
float
z
=
known
[
k
*
3
+
2
];
float
d
=
(
ux
-
x
)
*
(
ux
-
x
)
+
(
uy
-
y
)
*
(
uy
-
y
)
+
(
uz
-
z
)
*
(
uz
-
z
);
if
(
d
<
best1
)
{
best3
=
best2
;
besti3
=
besti2
;
best2
=
best1
;
besti2
=
besti1
;
best1
=
d
;
besti1
=
k
;
}
else
if
(
d
<
best2
)
{
best3
=
best2
;
besti3
=
besti2
;
best2
=
d
;
besti2
=
k
;
}
else
if
(
d
<
best3
)
{
best3
=
d
;
besti3
=
k
;
}
}
dist2
[
0
]
=
best1
;
dist2
[
1
]
=
best2
;
dist2
[
2
]
=
best3
;
idx
[
0
]
=
besti1
+
known_batch_start_idx
;
idx
[
1
]
=
besti2
+
known_batch_start_idx
;
idx
[
2
]
=
besti3
+
known_batch_start_idx
;
}
void
three_nn_kernel_launcher_stack
(
int
batch_size
,
int
N
,
int
M
,
const
float
*
unknown
,
const
int
*
unknown_batch_cnt
,
const
float
*
known
,
const
int
*
known_batch_cnt
,
float
*
dist2
,
int
*
idx
)
{
// unknown: (N1 + N2 ..., 3)
// unknown_batch_cnt: (batch_size), [N1, N2, ...]
// known: (M1 + M2 ..., 3)
// known_batch_cnt: (batch_size), [M1, M2, ...]
// Return:
// dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
// idx: (N1 + N2 ..., 3) index of the three nearest neighbors
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
N
,
THREADS_PER_BLOCK
));
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
three_nn_kernel_stack
<<<
blocks
,
threads
>>>
(
batch_size
,
N
,
M
,
unknown
,
unknown_batch_cnt
,
known
,
known_batch_cnt
,
dist2
,
idx
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
three_interpolate_kernel_stack
(
int
N
,
int
channels
,
const
float
*
features
,
const
int
*
idx
,
const
float
*
weight
,
float
*
out
)
{
// features: (M1 + M2 ..., C)
// idx: [N1 + N2 ..., 3]
// weight: [N1 + N2 ..., 3]
// Return:
// out: (N1 + N2 ..., C)
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
pt_idx
>=
N
||
c_idx
>=
channels
)
return
;
weight
+=
pt_idx
*
3
;
idx
+=
pt_idx
*
3
;
out
+=
pt_idx
*
channels
+
c_idx
;
out
[
0
]
=
weight
[
0
]
*
features
[
idx
[
0
]
*
channels
+
c_idx
]
+
weight
[
1
]
*
features
[
idx
[
1
]
*
channels
+
c_idx
]
+
weight
[
2
]
*
features
[
idx
[
2
]
*
channels
+
c_idx
];
}
void
three_interpolate_kernel_launcher_stack
(
int
N
,
int
channels
,
const
float
*
features
,
const
int
*
idx
,
const
float
*
weight
,
float
*
out
)
{
// features: (M1 + M2 ..., C)
// idx: [N1 + N2 ..., 3]
// weight: [N1 + N2 ..., 3]
// Return:
// out: (N1 + N2 ..., C)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
N
,
THREADS_PER_BLOCK
),
channels
);
dim3
threads
(
THREADS_PER_BLOCK
);
three_interpolate_kernel_stack
<<<
blocks
,
threads
>>>
(
N
,
channels
,
features
,
idx
,
weight
,
out
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
__global__
void
three_interpolate_grad_kernel_stack
(
int
N
,
int
channels
,
const
float
*
grad_out
,
const
int
*
idx
,
const
float
*
weight
,
float
*
grad_features
)
{
// grad_out_tensor: (N1 + N2 ..., C)
// idx_tensor: [N1 + N2 ..., 3]
// weight_tensor: [N1 + N2 ..., 3]
// Return:
// grad_features_tensor: (M1 + M2 ..., C)
int
c_idx
=
blockIdx
.
y
;
int
pt_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
pt_idx
>=
N
||
c_idx
>=
channels
)
return
;
grad_out
+=
pt_idx
*
channels
+
c_idx
;
weight
+=
pt_idx
*
3
;
idx
+=
pt_idx
*
3
;
// printf("pt_idx=%d, c_idx=%d, idx=(%d, %d, %d), grad_out=%f\n", pt_idx, c_idx, idx[0], idx[1], idx[2], grad_out[0]);
atomicAdd
(
grad_features
+
idx
[
0
]
*
channels
+
c_idx
,
grad_out
[
0
]
*
weight
[
0
]);
atomicAdd
(
grad_features
+
idx
[
1
]
*
channels
+
c_idx
,
grad_out
[
0
]
*
weight
[
1
]);
atomicAdd
(
grad_features
+
idx
[
2
]
*
channels
+
c_idx
,
grad_out
[
0
]
*
weight
[
2
]);
}
void
three_interpolate_grad_kernel_launcher_stack
(
int
N
,
int
channels
,
const
float
*
grad_out
,
const
int
*
idx
,
const
float
*
weight
,
float
*
grad_features
)
{
// grad_out_tensor: (N1 + N2 ..., C)
// idx_tensor: [N1 + N2 ..., 3]
// weight_tensor: [N1 + N2 ..., 3]
// Return:
// grad_features_tensor: (M1 + M2 ..., C)
cudaError_t
err
;
dim3
blocks
(
DIVUP
(
N
,
THREADS_PER_BLOCK
),
channels
);
// blockIdx.x(col), blockIdx.y(row)
dim3
threads
(
THREADS_PER_BLOCK
);
three_interpolate_grad_kernel_stack
<<<
blocks
,
threads
>>>
(
N
,
channels
,
grad_out
,
idx
,
weight
,
grad_features
);
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"CUDA kernel failed : %s
\n
"
,
cudaGetErrorString
(
err
));
exit
(
-
1
);
}
}
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_stack/src/interpolate_gpu.h
0 → 100644
View file @
32567b04
#ifndef _INTERPOLATE_GPU_H
#define _INTERPOLATE_GPU_H
#include <torch/serialize/tensor.h>
#include<vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
void
three_nn_wrapper_stack
(
at
::
Tensor
unknown_tensor
,
at
::
Tensor
unknown_batch_cnt_tensor
,
at
::
Tensor
known_tensor
,
at
::
Tensor
known_batch_cnt_tensor
,
at
::
Tensor
dist2_tensor
,
at
::
Tensor
idx_tensor
);
void
three_interpolate_wrapper_stack
(
at
::
Tensor
features_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
out_tensor
);
void
three_interpolate_grad_wrapper_stack
(
at
::
Tensor
grad_out_tensor
,
at
::
Tensor
idx_tensor
,
at
::
Tensor
weight_tensor
,
at
::
Tensor
grad_features_tensor
);
void
three_nn_kernel_launcher_stack
(
int
batch_size
,
int
N
,
int
M
,
const
float
*
unknown
,
const
int
*
unknown_batch_cnt
,
const
float
*
known
,
const
int
*
known_batch_cnt
,
float
*
dist2
,
int
*
idx
);
void
three_interpolate_kernel_launcher_stack
(
int
N
,
int
channels
,
const
float
*
features
,
const
int
*
idx
,
const
float
*
weight
,
float
*
out
);
void
three_interpolate_grad_kernel_launcher_stack
(
int
N
,
int
channels
,
const
float
*
grad_out
,
const
int
*
idx
,
const
float
*
weight
,
float
*
grad_features
);
#endif
\ No newline at end of file
pcdet/ops/pointnet2/pointnet2_stack/src/pointnet2_api.cpp
View file @
32567b04
...
...
@@ -4,6 +4,7 @@
#include "ball_query_gpu.h"
#include "group_points_gpu.h"
#include "sampling_gpu.h"
#include "interpolate_gpu.h"
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
...
...
@@ -13,4 +14,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m
.
def
(
"group_points_wrapper"
,
&
group_points_wrapper_stack
,
"group_points_wrapper_stack"
);
m
.
def
(
"group_points_grad_wrapper"
,
&
group_points_grad_wrapper_stack
,
"group_points_grad_wrapper_stack"
);
m
.
def
(
"three_nn_wrapper"
,
&
three_nn_wrapper_stack
,
"three_nn_wrapper_stack"
);
m
.
def
(
"three_interpolate_wrapper"
,
&
three_interpolate_wrapper_stack
,
"three_interpolate_wrapper_stack"
);
m
.
def
(
"three_interpolate_grad_wrapper"
,
&
three_interpolate_grad_wrapper_stack
,
"three_interpolate_grad_wrapper_stack"
);
}
pcdet/ops/pointnet2/pointnet2_stack/src/sampling.cpp
View file @
32567b04
...
...
@@ -6,16 +6,32 @@
#include "sampling_gpu.h"
extern
THCState
*
state
;
#define CHECK_CUDA(x) do { \
if (!x.type().is_cuda()) { \
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
if (!x.is_contiguous()) { \
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
int
furthest_point_sampling_wrapper
(
int
b
,
int
n
,
int
m
,
at
::
Tensor
points_tensor
,
at
::
Tensor
temp_tensor
,
at
::
Tensor
idx_tensor
)
{
CHECK_INPUT
(
points_tensor
);
CHECK_INPUT
(
temp_tensor
);
CHECK_INPUT
(
idx_tensor
);
const
float
*
points
=
points_tensor
.
data
<
float
>
();
float
*
temp
=
temp_tensor
.
data
<
float
>
();
int
*
idx
=
idx_tensor
.
data
<
int
>
();
cudaStream_t
stream
=
THCState_getCurrentStream
(
state
);
furthest_point_sampling_kernel_launcher
(
b
,
n
,
m
,
points
,
temp
,
idx
,
stream
);
furthest_point_sampling_kernel_launcher
(
b
,
n
,
m
,
points
,
temp
,
idx
);
return
1
;
}
pcdet/ops/pointnet2/pointnet2_stack/src/sampling_gpu.cu
View file @
32567b04
...
...
@@ -140,7 +140,7 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m,
}
void
furthest_point_sampling_kernel_launcher
(
int
b
,
int
n
,
int
m
,
const
float
*
dataset
,
float
*
temp
,
int
*
idxs
,
cudaStream_t
stream
)
{
const
float
*
dataset
,
float
*
temp
,
int
*
idxs
)
{
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
...
...
@@ -151,29 +151,29 @@ void furthest_point_sampling_kernel_launcher(int b, int n, int m,
switch
(
n_threads
)
{
case
1024
:
furthest_point_sampling_kernel
<
1024
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
1024
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
512
:
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
256
:
furthest_point_sampling_kernel
<
256
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
256
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
128
:
furthest_point_sampling_kernel
<
128
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
128
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
64
:
furthest_point_sampling_kernel
<
64
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
64
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
32
:
furthest_point_sampling_kernel
<
32
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
32
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
16
:
furthest_point_sampling_kernel
<
16
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
16
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
8
:
furthest_point_sampling_kernel
<
8
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
8
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
4
:
furthest_point_sampling_kernel
<
4
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
4
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
2
:
furthest_point_sampling_kernel
<
2
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
2
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
case
1
:
furthest_point_sampling_kernel
<
1
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
furthest_point_sampling_kernel
<
1
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
break
;
default:
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
,
0
,
stream
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
furthest_point_sampling_kernel
<
512
><<<
b
,
n_threads
>>>
(
b
,
n
,
m
,
dataset
,
temp
,
idxs
);
}
err
=
cudaGetLastError
();
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment