Commit f27d308f authored by yinchimaoliang

merge master

parents c66ae813 27ebcfac
@@ -3,82 +3,92 @@
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void gather_points_kernel(int b, int c, int n, int m,
                                     const float *__restrict__ points,
                                     const int *__restrict__ idx,
                                     float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, M)
  // output:
  //      out: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  points += bs_idx * c * n + c_idx * n;
  out[0] = points[idx[0]];
}

void gather_points_kernel_launcher(int b, int c, int n, int npoints,
                                   const float *points, const int *idx,
                                   float *out, cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints)
  // output:
  //      out: (B, C, npoints)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
                                                       idx, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
                                          const float *__restrict__ grad_out,
                                          const int *__restrict__ idx,
                                          float *__restrict__ grad_points) {
  // grad_out: (B, C, M)
  // idx: (B, M)
  // output:
  //      grad_points: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
  idx += bs_idx * m + pt_idx;
  grad_points += bs_idx * c * n + c_idx * n;

  atomicAdd(grad_points + idx[0], grad_out[0]);
}

void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                        const float *grad_out, const int *idx,
                                        float *grad_points,
                                        cudaStream_t stream) {
  // grad_out: (B, C, npoints)
  // idx: (B, npoints)
  // output:
  //      grad_points: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
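For reference, the forward gather above is a pure index lookup, out[b, c, i] = points[b, c, idx[b, i]], which matches a plain torch.gather along the point dimension. A minimal PyTorch sketch of the same computation (illustrative only, not part of this commit; tensor names and sizes are assumptions):

import torch

B, C, N, M = 2, 4, 1024, 128
points = torch.rand(B, C, N)
idx = torch.randint(0, N, (B, M))  # indices into the N axis

# Equivalent of gather_points_kernel: out[b, c, i] = points[b, c, idx[b, i]]
out = points.gather(2, idx.unsqueeze(1).expand(B, C, M))
assert out.shape == (B, C, M)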
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

extern THCState *state;

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor);

void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
                                  const float *points, const int *idx,
                                  float *out, cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor);

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream);

int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
                              at::Tensor grad_out_tensor, at::Tensor idx_tensor,
                              at::Tensor grad_points_tensor) {
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  const float *grad_out = grad_out_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
                                    grad_points, stream);
  return 1;
}

int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
                         at::Tensor points_tensor, at::Tensor idx_tensor,
                         at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();
  float *out = out_tensor.data_ptr<float>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
                               stream);
  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &group_points_wrapper, "group_points_wrapper");
  m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
}
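A hedged sketch of how the bindings above are typically driven from Python once the extension is compiled. The module name group_points_ext is a placeholder (the real name depends on the build setup); the wrapper expects a pre-allocated output tensor and int32 indices, since it reads idx_tensor.data_ptr<int>():

import torch
import group_points_ext  # placeholder name for the compiled extension

B, C, N, P, S = 2, 16, 1024, 256, 32
points = torch.rand(B, C, N, device='cuda')
idx = torch.randint(0, N, (B, P, S), dtype=torch.int32, device='cuda')
out = points.new_zeros((B, C, P, S))

# forward(b, c, n, npoints, nsample, points, idx, out) fills `out` in place.
group_points_ext.forward(B, C, N, P, S, points, idx, out)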
@@ -2,84 +2,97 @@
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
                                         int nsample,
                                         const float *__restrict__ grad_out,
                                         const int *__restrict__ idx,
                                         float *__restrict__ grad_points) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;
  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
              pt_idx * nsample + sample_idx;
  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;

  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}

void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
                                       int nsample, const float *grad_out,
                                       const int *idx, float *grad_points,
                                       cudaStream_t stream) {
  // grad_out: (B, C, npoints, nsample)
  // idx: (B, npoints, nsample)
  // output:
  //      grad_points: (B, C, N)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, npoints, nsample, grad_out, idx, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void group_points_kernel(int b, int c, int n, int npoints,
                                    int nsample,
                                    const float *__restrict__ points,
                                    const int *__restrict__ idx,
                                    float *__restrict__ out) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int pt_idx = index / nsample;
  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

  int sample_idx = index % nsample;

  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
                pt_idx * nsample + sample_idx;

  out[out_idx] = points[in_idx];
}

void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
                                  const float *points, const int *idx,
                                  float *out, cudaStream_t stream) {
  // points: (B, C, N)
  // idx: (B, npoints, nsample)
  // output:
  //      out: (B, C, npoints, nsample)
  cudaError_t err;
  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample,
                                                      points, idx, out);
  // cudaDeviceSynchronize();  // for using printf in kernel function
  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
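As with gather_points, the forward grouping is an index lookup per (point, sample) pair: out[b, c, p, s] = points[b, c, idx[b, p, s]]. A minimal PyTorch reference of that computation (illustrative only, not part of this commit):

import torch

B, C, N, P, S = 2, 16, 1024, 256, 32
points = torch.rand(B, C, N)
idx = torch.randint(0, N, (B, P, S))

flat_idx = idx.reshape(B, 1, P * S).expand(B, C, P * S)
out = points.gather(2, flat_idx).reshape(B, C, P, S)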
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>

extern THCState *state;

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor);

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream);

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor, at::Tensor idx_tensor,
                               at::Tensor weight_tensor, at::Tensor out_tensor);

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream);

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor);

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx, const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream);

void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
                      at::Tensor known_tensor, at::Tensor dist2_tensor,
                      at::Tensor idx_tensor) {
  const float *unknown = unknown_tensor.data_ptr<float>();
  const float *known = known_tensor.data_ptr<float>();
  float *dist2 = dist2_tensor.data_ptr<float>();
  int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}

void three_interpolate_wrapper(int b, int c, int m, int n,
                               at::Tensor points_tensor, at::Tensor idx_tensor,
                               at::Tensor weight_tensor,
                               at::Tensor out_tensor) {
  const float *points = points_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *out = out_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
                                    stream);
}

void three_interpolate_grad_wrapper(int b, int c, int n, int m,
                                    at::Tensor grad_out_tensor,
                                    at::Tensor idx_tensor,
                                    at::Tensor weight_tensor,
                                    at::Tensor grad_points_tensor) {
  const float *grad_out = grad_out_tensor.data_ptr<float>();
  const float *weight = weight_tensor.data_ptr<float>();
  float *grad_points = grad_points_tensor.data_ptr<float>();
  const int *idx = idx_tensor.data_ptr<int>();

  cudaStream_t stream = THCState_getCurrentStream(state);
  three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
                                         grad_points, stream);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
  m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
        "three_interpolate_wrapper");
  m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
        "three_interpolate_grad_wrapper");
}
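A hedged sketch of the usual call sequence for these wrappers from Python. The module name interpolate_ext is a placeholder, and the inverse-distance weighting is the common PointNet++-style choice rather than something fixed by this file; the argument order and pre-allocated outputs follow the bindings above:

import torch
import interpolate_ext  # placeholder name for the compiled extension

B, C, N, M = 2, 16, 4096, 1024
unknown = torch.rand(B, N, 3, device='cuda')  # points to interpolate onto
known = torch.rand(B, M, 3, device='cuda')    # points that carry features
dist2 = unknown.new_zeros((B, N, 3))
idx = torch.zeros(B, N, 3, dtype=torch.int32, device='cuda')
interpolate_ext.three_nn_wrapper(B, N, M, unknown, known, dist2, idx)

# Inverse-distance weights over the three neighbours.
weight = 1.0 / (dist2 + 1e-8)
weight = weight / weight.sum(dim=2, keepdim=True)

features = torch.rand(B, C, M, device='cuda')
out = features.new_zeros((B, C, N))
interpolate_ext.three_interpolate_wrapper(B, C, M, N, features, idx, weight, out)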
@@ -3,91 +3,103 @@
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_interpolate_kernel(int b, int c, int m, int n,
                                         const float *__restrict__ points,
                                         const int *__restrict__ idx,
                                         const float *__restrict__ weight,
                                         float *__restrict__ out) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  weight += bs_idx * n * 3 + pt_idx * 3;
  points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;
  out += bs_idx * c * n + c_idx * n;

  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
                weight[2] * points[idx[2]];
}

void three_interpolate_kernel_launcher(int b, int c, int m, int n,
                                       const float *points, const int *idx,
                                       const float *weight, float *out,
                                       cudaStream_t stream) {
  // points: (B, C, M)
  // idx: (B, N, 3)
  // weight: (B, N, 3)
  // output:
  //      out: (B, C, N)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points,
                                                           idx, weight, out);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

__global__ void three_interpolate_grad_kernel(
    int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight,
    float *__restrict__ grad_points) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  int bs_idx = blockIdx.z;
  int c_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
  weight += bs_idx * n * 3 + pt_idx * 3;
  grad_points += bs_idx * c * m + c_idx * m;
  idx += bs_idx * n * 3 + pt_idx * 3;

  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
}

void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
                                            const float *grad_out,
                                            const int *idx, const float *weight,
                                            float *grad_points,
                                            cudaStream_t stream) {
  // grad_out: (B, C, N)
  // weight: (B, N, 3)
  // output:
  //      grad_points: (B, C, M)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);
  three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
      b, c, n, m, grad_out, idx, weight, grad_points);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
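A minimal PyTorch reference for the forward interpolation computed above, i.e. out[b, c, i] = sum_k weight[b, i, k] * points[b, c, idx[b, i, k]] (illustrative only, not part of this commit):

import torch

B, C, M, N = 2, 16, 1024, 4096
points = torch.rand(B, C, M)
idx = torch.randint(0, M, (B, N, 3))
weight = torch.rand(B, N, 3)

gathered = points.gather(2, idx.reshape(B, 1, N * 3).expand(B, C, N * 3))
out = (gathered.reshape(B, C, N, 3) * weight.reshape(B, 1, N, 3)).sum(-1)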
@@ -3,72 +3,84 @@
#include <stdlib.h>

#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

__global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  int bs_idx = blockIdx.y;
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (bs_idx >= b || pt_idx >= n) return;

  unknown += bs_idx * n * 3 + pt_idx * 3;
  known += bs_idx * m * 3;
  dist2 += bs_idx * n * 3 + pt_idx * 3;
  idx += bs_idx * n * 3 + pt_idx * 3;

  float ux = unknown[0];
  float uy = unknown[1];
  float uz = unknown[2];

  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
  int besti1 = 0, besti2 = 0, besti3 = 0;
  for (int k = 0; k < m; ++k) {
    float x = known[k * 3 + 0];
    float y = known[k * 3 + 1];
    float z = known[k * 3 + 2];
    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
    if (d < best1) {
      best3 = best2;
      besti3 = besti2;
      best2 = best1;
      besti2 = besti1;
      best1 = d;
      besti1 = k;
    } else if (d < best2) {
      best3 = best2;
      besti3 = besti2;
      best2 = d;
      besti2 = k;
    } else if (d < best3) {
      best3 = d;
      besti3 = k;
    }
  }
  dist2[0] = best1;
  dist2[1] = best2;
  dist2[2] = best3;
  idx[0] = besti1;
  idx[1] = besti2;
  idx[2] = besti3;
}

void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
                              const float *known, float *dist2, int *idx,
                              cudaStream_t stream) {
  // unknown: (B, N, 3)
  // known: (B, M, 3)
  // output:
  //      dist2: (B, N, 3)
  //      idx: (B, N, 3)

  cudaError_t err;
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
              b);  // blockIdx.x(col), blockIdx.y(row)
  dim3 threads(THREADS_PER_BLOCK);

  three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
                                                  dist2, idx);

  err = cudaGetLastError();
  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}
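The brute-force loop above keeps the three smallest squared distances (and their indices) per query point. A minimal PyTorch equivalent for small problem sizes (illustrative only, not part of this commit):

import torch

B, N, M = 2, 4096, 1024
unknown = torch.rand(B, N, 3)
known = torch.rand(B, M, 3)

diff = unknown.unsqueeze(2) - known.unsqueeze(1)  # (B, N, M, 3)
d2 = (diff * diff).sum(-1)                        # squared distances, (B, N, M)
dist2, idx = d2.topk(3, dim=-1, largest=False)    # three nearest, ascending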
from .points_in_boxes import (points_in_boxes_batch, points_in_boxes_cpu,
                              points_in_boxes_gpu)
from .roiaware_pool3d import RoIAwarePool3d

__all__ = [
    'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu',
    'points_in_boxes_batch'
]
@@ -53,3 +53,29 @@ def points_in_boxes_cpu(points, boxes):
                                            point_indices)
    return point_indices


def points_in_boxes_batch(points, boxes):
    """Find points that are in boxes (CUDA)

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate,
            (x, y, z) is the bottom center

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0
    """
    assert boxes.shape[0] == points.shape[0]
    assert boxes.shape[2] == 7
    batch_size, num_points, _ = points.shape
    num_boxes = boxes.shape[1]

    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
                                       dtype=torch.int).fill_(0)
    roiaware_pool3d_ext.points_in_boxes_batch(boxes.contiguous(),
                                              points.contiguous(),
                                              box_idxs_of_pts)

    return box_idxs_of_pts
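A hedged usage sketch for the new helper; the import path below is an assumption (adjust it to wherever this package is installed) and the tensors are random placeholders with the documented shapes:

import torch
from mmdet3d.ops.roiaware_pool3d import points_in_boxes_batch  # assumed path

points = torch.rand(2, 128, 3, device='cuda')  # (B, M, 3), LiDAR coordinates
boxes = torch.rand(2, 10, 7, device='cuda')    # (B, T, 7): x, y, z, w, l, h, ry
box_idxs_of_pts = points_in_boxes_batch(points, boxes)  # (B, M, T), background = 0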