Commit d1b9ae40 authored by zhangwenwei's avatar zhangwenwei
Browse files

clean c files

parent 16c3f6e1
......@@ -15,6 +15,7 @@ repos:
rev: v0.30.0
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.5.0
hooks:
......
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>
extern THCState *state;
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
at::Tensor idx_tensor);
void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
const float *xyz, const float *new_xyz, int *idx, cudaStream_t stream);
const float *xyz, const float *new_xyz,
int *idx, cudaStream_t stream);
int ball_query_wrapper(int b, int n, int m, float radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
at::Tensor idx_tensor) {
CHECK_INPUT(new_xyz_tensor);
CHECK_INPUT(xyz_tensor);
const float *new_xyz = new_xyz_tensor.data<float>();
const float *xyz = xyz_tensor.data<float>();
int *idx = idx_tensor.data<int>();
const float *new_xyz = new_xyz_tensor.data_ptr<float>();
const float *xyz = xyz_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
ball_query_kernel_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
ball_query_kernel_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx,
stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("ball_query_wrapper", &ball_query_wrapper, "ball_query_wrapper");
}
......@@ -3,11 +3,13 @@
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void ball_query_kernel(int b, int n, int m, float radius, int nsample,
const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
__global__ void ball_query_kernel(int b, int n, int m, float radius,
int nsample,
const float *__restrict__ new_xyz,
const float *__restrict__ xyz,
int *__restrict__ idx) {
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
......@@ -30,9 +32,10 @@ __global__ void ball_query_kernel(int b, int n, int m, float radius, int nsample
float x = xyz[k * 3 + 0];
float y = xyz[k * 3 + 1];
float z = xyz[k * 3 + 2];
float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
if (d2 < radius2){
if (cnt == 0){
float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 < radius2) {
if (cnt == 0) {
for (int l = 0; l < nsample; ++l) {
idx[l] = k;
}
......@@ -44,9 +47,9 @@ __global__ void ball_query_kernel(int b, int n, int m, float radius, int nsample
}
}
void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample, \
const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) {
void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
const float *new_xyz, const float *xyz,
int *idx, cudaStream_t stream) {
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
......@@ -54,10 +57,12 @@ void ball_query_kernel_launcher(int b, int n, int m, float radius, int nsample,
cudaError_t err;
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK),
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
ball_query_kernel<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
ball_query_kernel<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample,
new_xyz, xyz, idx);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
......
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>
extern THCState *state;
int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
at::Tensor points_tensor,
at::Tensor temp_tensor,
at::Tensor idx_tensor);
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream);
const float *dataset, float *temp,
int *idxs, cudaStream_t stream);
int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) {
const float *points = points_tensor.data<float>();
float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>();
at::Tensor points_tensor,
at::Tensor temp_tensor,
at::Tensor idx_tensor) {
const float *points = points_tensor.data_ptr<float>();
float *temp = temp_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
......@@ -26,5 +30,6 @@ int furthest_point_sampling_wrapper(int b, int n, int m,
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper,
"furthest_point_sampling_wrapper");
}
......@@ -3,7 +3,7 @@
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
inline int opt_n_threads(int work_size) {
const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
......@@ -11,7 +11,8 @@ inline int opt_n_threads(int work_size) {
return max(min(1 << pow_2, TOTAL_THREADS), 1);
}
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
int idx1, int idx2) {
const float v1 = dists[idx1], v2 = dists[idx2];
const int i1 = dists_i[idx1], i2 = dists_i[idx2];
dists[idx1] = max(v1, v2);
......@@ -19,8 +20,9 @@ __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, i
}
template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(int b, int n, int m,
const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
__global__ void furthest_point_sampling_kernel(
int b, int n, int m, const float *__restrict__ dataset,
float *__restrict__ temp, int *__restrict__ idxs) {
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
......@@ -39,8 +41,7 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m,
const int stride = block_size;
int old = 0;
if (threadIdx.x == 0)
idxs[0] = old;
if (threadIdx.x == 0) idxs[0] = old;
__syncthreads();
for (int j = 1; j < m; j++) {
......@@ -58,7 +59,8 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m,
// if (mag <= 1e-3)
// continue;
float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
float d =
(x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
float d2 = min(d, temp[k]);
temp[k] = d2;
besti = d2 > best ? k : besti;
......@@ -131,13 +133,13 @@ __global__ void furthest_point_sampling_kernel(int b, int n, int m,
}
old = dists_i[0];
if (tid == 0)
idxs[j] = old;
if (tid == 0) idxs[j] = old;
}
}
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp, int *idxs, cudaStream_t stream) {
const float *dataset, float *temp,
int *idxs, cudaStream_t stream) {
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
......@@ -148,29 +150,52 @@ void furthest_point_sampling_kernel_launcher(int b, int n, int m,
switch (n_threads) {
case 1024:
furthest_point_sampling_kernel<1024><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<1024>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 512:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<512>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 256:
furthest_point_sampling_kernel<256><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<256>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 128:
furthest_point_sampling_kernel<128><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<128>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 64:
furthest_point_sampling_kernel<64><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<64>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 32:
furthest_point_sampling_kernel<32><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<32>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 16:
furthest_point_sampling_kernel<16><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<16>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 8:
furthest_point_sampling_kernel<8><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<8>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 4:
furthest_point_sampling_kernel<4><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<4>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 2:
furthest_point_sampling_kernel<2><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<2>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 1:
furthest_point_sampling_kernel<1><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break;
furthest_point_sampling_kernel<1>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
default:
furthest_point_sampling_kernel<512><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
furthest_point_sampling_kernel<512>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
}
err = cudaGetLastError();
......
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>
extern THCState *state;
int gather_points_wrapper(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor);
void gather_points_kernel_launcher(int b, int c, int n, int npoints,
const float *points, const int *idx, float *out, cudaStream_t stream);
const float *points, const int *idx,
float *out, cudaStream_t stream);
int gather_points_grad_wrapper(int b, int c, int n, int npoints,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor grad_points_tensor);
void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
const float *grad_out, const int *idx,
float *grad_points,
cudaStream_t stream);
int gather_points_wrapper(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){
const float *points = points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
float *out = out_tensor.data<float>();
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor) {
const float *points = points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
float *out = out_tensor.data_ptr<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream);
return 1;
}
int gather_points_grad_wrapper(int b, int c, int n, int npoints,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
const float *grad_out = grad_out_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
float *grad_points = grad_points_tensor.data<float>();
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor grad_points_tensor) {
const float *grad_out = grad_out_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
float *grad_points = grad_points_tensor.data_ptr<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx, grad_points, stream);
gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx,
grad_points, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("gather_points_wrapper", &gather_points_wrapper, "gather_points_wrapper");
m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper, "gather_points_grad_wrapper");
m.def("gather_points_wrapper", &gather_points_wrapper,
"gather_points_wrapper");
m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
"gather_points_grad_wrapper");
}
......@@ -3,11 +3,12 @@
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void gather_points_kernel(int b, int c, int n, int m,
const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
const float *__restrict__ points,
const int *__restrict__ idx,
float *__restrict__ out) {
// points: (B, C, N)
// idx: (B, M)
// output:
......@@ -25,17 +26,20 @@ __global__ void gather_points_kernel(int b, int c, int n, int m,
}
void gather_points_kernel_launcher(int b, int c, int n, int npoints,
const float *points, const int *idx, float *out, cudaStream_t stream) {
const float *points, const int *idx,
float *out, cudaStream_t stream) {
// points: (B, C, N)
// idx: (B, npoints)
// output:
// out: (B, C, npoints)
cudaError_t err;
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points, idx, out);
gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
idx, out);
err = cudaGetLastError();
if (cudaSuccess != err) {
......@@ -44,8 +48,10 @@ void gather_points_kernel_launcher(int b, int c, int n, int npoints,
}
}
__global__ void gather_points_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out,
const int *__restrict__ idx, float *__restrict__ grad_points) {
__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
const float *__restrict__ grad_out,
const int *__restrict__ idx,
float *__restrict__ grad_points) {
// grad_out: (B, C, M)
// idx: (B, M)
// output:
......@@ -64,17 +70,21 @@ __global__ void gather_points_grad_kernel(int b, int c, int n, int m, const floa
}
void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
const float *grad_out, const int *idx,
float *grad_points,
cudaStream_t stream) {
// grad_out: (B, C, npoints)
// idx: (B, npoints)
// output:
// grad_points: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, grad_out, idx, grad_points);
gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
b, c, n, npoints, grad_out, idx, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
......
#include <torch/serialize/tensor.h>
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <THC/THC.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>
extern THCState *state;
int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor);
void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx, float *out, cudaStream_t stream);
const float *points, const int *idx,
float *out, cudaStream_t stream);
int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
at::Tensor grad_out_tensor, at::Tensor idx_tensor,
at::Tensor grad_points_tensor);
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
int nsample, const float *grad_out,
const int *idx, float *grad_points,
cudaStream_t stream);
int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
float *grad_points = grad_points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
const float *grad_out = grad_out_tensor.data<float>();
at::Tensor grad_out_tensor, at::Tensor idx_tensor,
at::Tensor grad_points_tensor) {
float *grad_points = grad_points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
const float *grad_out = grad_out_tensor.data_ptr<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream);
group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
grad_points, stream);
return 1;
}
int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
const float *points = points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
float *out = out_tensor.data<float>();
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor) {
const float *points = points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
float *out = out_tensor.data_ptr<float>();
cudaStream_t stream = THCState_getCurrentStream(state);
group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out, stream);
group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
stream);
return 1;
}
......
......@@ -2,10 +2,13 @@
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int nsample,
const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
int nsample,
const float *__restrict__ grad_out,
const int *__restrict__ idx,
float *__restrict__ grad_points) {
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
......@@ -17,23 +20,28 @@ __global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int n
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
int sample_idx = index % nsample;
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]);
atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
int nsample, const float *grad_out,
const int *idx, float *grad_points,
cudaStream_t stream) {
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
group_points_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
b, c, n, npoints, nsample, grad_out, idx, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
......@@ -42,9 +50,11 @@ void group_points_grad_kernel_launcher(int b, int c, int n, int npoints, int nsa
}
}
__global__ void group_points_kernel(int b, int c, int n, int npoints, int nsample,
const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
__global__ void group_points_kernel(int b, int c, int n, int npoints,
int nsample,
const float *__restrict__ points,
const int *__restrict__ idx,
float *__restrict__ out) {
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
......@@ -59,23 +69,26 @@ __global__ void group_points_kernel(int b, int c, int n, int npoints, int nsampl
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int in_idx = bs_idx * c * n + c_idx * n + idx[0];
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
out[out_idx] = points[in_idx];
}
void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx, float *out, cudaStream_t stream) {
const float *points, const int *idx,
float *out, cudaStream_t stream) {
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
cudaError_t err;
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample, points, idx, out);
group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample,
points, idx, out);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
......
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <vector>
extern THCState *state;
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
at::Tensor known_tensor, at::Tensor dist2_tensor,
at::Tensor idx_tensor);
void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx, cudaStream_t stream);
const float *known, float *dist2, int *idx,
cudaStream_t stream);
void three_interpolate_wrapper(int b, int c, int m, int n, at::Tensor points_tensor,
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
void three_interpolate_wrapper(int b, int c, int m, int n,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor weight_tensor, at::Tensor out_tensor);
void three_interpolate_kernel_launcher(int b, int c, int m, int n,
const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream);
const float *points, const int *idx,
const float *weight, float *out,
cudaStream_t stream);
void three_interpolate_grad_wrapper(int b, int c, int n, int m,
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor grad_points_tensor);
void three_interpolate_grad_wrapper(int b, int c, int n, int m, at::Tensor grad_out_tensor,
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out,
const int *idx, const float *weight, float *grad_points, cudaStream_t stream);
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
const float *grad_out,
const int *idx, const float *weight,
float *grad_points,
cudaStream_t stream);
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
const float *unknown = unknown_tensor.data<float>();
const float *known = known_tensor.data<float>();
float *dist2 = dist2_tensor.data<float>();
int *idx = idx_tensor.data<int>();
at::Tensor known_tensor, at::Tensor dist2_tensor,
at::Tensor idx_tensor) {
const float *unknown = unknown_tensor.data_ptr<float>();
const float *known = known_tensor.data_ptr<float>();
float *dist2 = dist2_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}
void three_interpolate_wrapper(int b, int c, int m, int n,
at::Tensor points_tensor,
at::Tensor idx_tensor,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor out_tensor) {
const float *points = points_tensor.data<float>();
const float *weight = weight_tensor.data<float>();
float *out = out_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
const float *points = points_tensor.data_ptr<float>();
const float *weight = weight_tensor.data_ptr<float>();
float *out = out_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out, stream);
three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
stream);
}
void three_interpolate_grad_wrapper(int b, int c, int n, int m,
......@@ -63,19 +71,20 @@ void three_interpolate_grad_wrapper(int b, int c, int n, int m,
at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor grad_points_tensor) {
const float *grad_out = grad_out_tensor.data<float>();
const float *weight = weight_tensor.data<float>();
float *grad_points = grad_points_tensor.data<float>();
const int *idx = idx_tensor.data<int>();
const float *grad_out = grad_out_tensor.data_ptr<float>();
const float *weight = weight_tensor.data_ptr<float>();
float *grad_points = grad_points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = THCState_getCurrentStream(state);
three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight, grad_points, stream);
three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
grad_points, stream);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
m.def("three_interpolate_wrapper", &three_interpolate_wrapper, "three_interpolate_wrapper");
m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper, "three_interpolate_grad_wrapper");
m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
"three_interpolate_wrapper");
m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
"three_interpolate_grad_wrapper");
}
......@@ -3,11 +3,13 @@
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void three_interpolate_kernel(int b, int c, int m, int n, const float *__restrict__ points,
const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
__global__ void three_interpolate_kernel(int b, int c, int m, int n,
const float *__restrict__ points,
const int *__restrict__ idx,
const float *__restrict__ weight,
float *__restrict__ out) {
// points: (B, C, M)
// idx: (B, N, 3)
// weight: (B, N, 3)
......@@ -25,11 +27,14 @@ __global__ void three_interpolate_kernel(int b, int c, int m, int n, const float
idx += bs_idx * n * 3 + pt_idx * 3;
out += bs_idx * c * n + c_idx * n;
out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
weight[2] * points[idx[2]];
}
void three_interpolate_kernel_launcher(int b, int c, int m, int n,
const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) {
const float *points, const int *idx,
const float *weight, float *out,
cudaStream_t stream) {
// points: (B, C, M)
// idx: (B, N, 3)
// weight: (B, N, 3)
......@@ -37,9 +42,11 @@ void three_interpolate_kernel_launcher(int b, int c, int m, int n,
// out: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points, idx, weight, out);
three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points,
idx, weight, out);
err = cudaGetLastError();
if (cudaSuccess != err) {
......@@ -48,9 +55,10 @@ void three_interpolate_kernel_launcher(int b, int c, int m, int n,
}
}
__global__ void three_interpolate_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out,
const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
__global__ void three_interpolate_grad_kernel(
int b, int c, int n, int m, const float *__restrict__ grad_out,
const int *__restrict__ idx, const float *__restrict__ weight,
float *__restrict__ grad_points) {
// grad_out: (B, C, N)
// weight: (B, N, 3)
// output:
......@@ -67,23 +75,27 @@ __global__ void three_interpolate_grad_kernel(int b, int c, int n, int m, const
grad_points += bs_idx * c * m + c_idx * m;
idx += bs_idx * n * 3 + pt_idx * 3;
atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
}
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m, const float *grad_out,
const int *idx, const float *weight, float *grad_points, cudaStream_t stream) {
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
const float *grad_out,
const int *idx, const float *weight,
float *grad_points,
cudaStream_t stream) {
// grad_out: (B, C, N)
// weight: (B, N, 3)
// output:
// grad_points: (B, C, M)
cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(b, c, n, m, grad_out, idx, weight, grad_points);
three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
b, c, n, m, grad_out, idx, weight, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
......
......@@ -3,11 +3,13 @@
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void three_nn_kernel(int b, int n, int m, const float *__restrict__ unknown,
const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
__global__ void three_nn_kernel(int b, int n, int m,
const float *__restrict__ unknown,
const float *__restrict__ known,
float *__restrict__ dist2,
int *__restrict__ idx) {
// unknown: (B, N, 3)
// known: (B, M, 3)
// output:
......@@ -35,25 +37,33 @@ __global__ void three_nn_kernel(int b, int n, int m, const float *__restrict__ u
float z = known[k * 3 + 2];
float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
if (d < best1) {
best3 = best2; besti3 = besti2;
best2 = best1; besti2 = besti1;
best1 = d; besti1 = k;
}
else if (d < best2) {
best3 = best2; besti3 = besti2;
best2 = d; besti2 = k;
best3 = best2;
besti3 = besti2;
best2 = best1;
besti2 = besti1;
best1 = d;
besti1 = k;
} else if (d < best2) {
best3 = best2;
besti3 = besti2;
best2 = d;
besti2 = k;
} else if (d < best3) {
best3 = d;
besti3 = k;
}
else if (d < best3) {
best3 = d; besti3 = k;
}
}
dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
dist2[0] = best1;
dist2[1] = best2;
dist2[2] = best3;
idx[0] = besti1;
idx[1] = besti2;
idx[2] = besti3;
}
void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx, cudaStream_t stream) {
const float *known, float *dist2, int *idx,
cudaStream_t stream) {
// unknown: (B, N, 3)
// known: (B, M, 3)
// output:
......@@ -61,10 +71,12 @@ void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
// idx: (B, N, 3)
cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known, dist2, idx);
three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
dist2, idx);
err = cudaGetLastError();
if (cudaSuccess != err) {
......
......@@ -8,57 +8,68 @@ const float EPS = 1e-8;
struct Point {
float x, y;
__device__ Point() {}
__device__ Point(double _x, double _y){
x = _x, y = _y;
}
__device__ Point(double _x, double _y) { x = _x, y = _y; }
__device__ void set(float _x, float _y){
x = _x; y = _y;
__device__ void set(float _x, float _y) {
x = _x;
y = _y;
}
__device__ Point operator +(const Point &b)const{
__device__ Point operator+(const Point &b) const {
return Point(x + b.x, y + b.y);
}
__device__ Point operator -(const Point &b)const{
__device__ Point operator-(const Point &b) const {
return Point(x - b.x, y - b.y);
}
};
__device__ inline float cross(const Point &a, const Point &b){
__device__ inline float cross(const Point &a, const Point &b) {
return a.x * b.y - a.y * b.x;
}
__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
__device__ inline float cross(const Point &p1, const Point &p2,
const Point &p0) {
return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
}
__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
int ret = min(p1.x,p2.x) <= max(q1.x,q2.x) &&
min(q1.x,q2.x) <= max(p1.x,p2.x) &&
min(p1.y,p2.y) <= max(q1.y,q2.y) &&
min(q1.y,q2.y) <= max(p1.y,p2.y);
__device__ int check_rect_cross(const Point &p1, const Point &p2,
const Point &q1, const Point &q2) {
int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
min(q1.x, q2.x) <= max(p1.x, p2.x) &&
min(p1.y, p2.y) <= max(q1.y, q2.y) &&
min(q1.y, q2.y) <= max(p1.y, p2.y);
return ret;
}
__device__ inline int check_in_box2d(const float *box, const Point &p){
//params: box (5) [x1, y1, x2, y2, angle]
__device__ inline int check_in_box2d(const float *box, const Point &p) {
// params: box (5) [x1, y1, x2, y2, angle]
const float MARGIN = 1e-5;
float center_x = (box[0] + box[2]) / 2;
float center_y = (box[1] + box[3]) / 2;
float angle_cos = cos(-box[4]), angle_sin = sin(-box[4]); // rotate the point in the opposite direction of box
float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x;
float rot_y = -(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
float angle_cos = cos(-box[4]),
angle_sin =
sin(-box[4]); // rotate the point in the opposite direction of box
float rot_x =
(p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x;
float rot_y =
-(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
#ifdef DEBUG
printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2], box[3], box[4]);
printf("center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, %.3f)\n", center_x, center_y,
angle_cos, angle_sin, p.x, p.y, rot_x, rot_y);
printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2],
box[3], box[4]);
printf(
"center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, "
"%.3f)\n",
center_x, center_y, angle_cos, angle_sin, p.x, p.y, rot_x, rot_y);
#endif
return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
}
__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
__device__ inline int intersection(const Point &p1, const Point &p0,
const Point &q1, const Point &q0,
Point &ans) {
// fast exclusion
if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
......@@ -72,12 +83,11 @@ __device__ inline int intersection(const Point &p1, const Point &p0, const Point
// calculate intersection of two lines
float s5 = cross(q1, p1, p0);
if(fabs(s5 - s1) > EPS){
if (fabs(s5 - s1) > EPS) {
ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
}
else{
} else {
float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
float D = a0 * b1 - a1 * b0;
......@@ -89,29 +99,39 @@ __device__ inline int intersection(const Point &p1, const Point &p0, const Point
return 1;
}
__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x;
float new_y = -(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
__device__ inline void rotate_around_center(const Point &center,
const float angle_cos,
const float angle_sin, Point &p) {
float new_x =
(p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x;
float new_y =
-(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
p.set(new_x, new_y);
}
__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
__device__ inline int point_cmp(const Point &a, const Point &b,
const Point &center) {
return atan2(a.y - center.y, a.x - center.x) >
atan2(b.y - center.y, b.x - center.x);
}
__device__ inline float box_overlap(const float *box_a, const float *box_b){
__device__ inline float box_overlap(const float *box_a, const float *box_b) {
// params: box_a (5) [x1, y1, x2, y2, angle]
// params: box_b (5) [x1, y1, x2, y2, angle]
float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], a_angle = box_a[4];
float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], b_angle = box_b[4];
float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3],
a_angle = box_a[4];
float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3],
b_angle = box_b[4];
Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2);
Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2);
#ifdef DEBUG
printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
b_x1, b_y1, b_x2, b_y2, b_angle);
printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
printf(
"a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n",
a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle);
printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y,
center_b.x, center_b.y);
#endif
Point box_a_corners[5];
......@@ -130,14 +150,17 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b){
float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
for (int k = 0; k < 4; k++){
for (int k = 0; k < 4; k++) {
#ifdef DEBUG
printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k,
box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
box_b_corners[k].y);
#endif
rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
#ifdef DEBUG
printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x,
box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
#endif
}
......@@ -150,10 +173,12 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b){
int cnt = 0, flag = 0;
poly_center.set(0, 0);
for (int i = 0; i < 4; i++){
for (int j = 0; j < 4; j++){
flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
if (flag){
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
flag = intersection(box_a_corners[i + 1], box_a_corners[i],
box_b_corners[j + 1], box_b_corners[j],
cross_points[cnt]);
if (flag) {
poly_center = poly_center + cross_points[cnt];
cnt++;
}
......@@ -161,13 +186,13 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b){
}
// check corners
for (int k = 0; k < 4; k++){
if (check_in_box2d(box_a, box_b_corners[k])){
for (int k = 0; k < 4; k++) {
if (check_in_box2d(box_a, box_b_corners[k])) {
poly_center = poly_center + box_b_corners[k];
cross_points[cnt] = box_b_corners[k];
cnt++;
}
if (check_in_box2d(box_b, box_a_corners[k])){
if (check_in_box2d(box_b, box_a_corners[k])) {
poly_center = poly_center + box_a_corners[k];
cross_points[cnt] = box_a_corners[k];
cnt++;
......@@ -179,9 +204,9 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b){
// sort the points of polygon
Point temp;
for (int j = 0; j < cnt - 1; j++){
for (int i = 0; i < cnt - j - 1; i++){
if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
for (int j = 0; j < cnt - 1; j++) {
for (int i = 0; i < cnt - j - 1; i++) {
if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
temp = cross_points[i];
cross_points[i] = cross_points[i + 1];
cross_points[i + 1] = temp;
......@@ -191,21 +216,23 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b){
#ifdef DEBUG
printf("cnt=%d\n", cnt);
for (int i = 0; i < cnt; i++){
printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
for (int i = 0; i < cnt; i++) {
printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x,
cross_points[i].y);
}
#endif
// get the overlap areas
float area = 0;
for (int k = 0; k < cnt - 1; k++){
area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
for (int k = 0; k < cnt - 1; k++) {
area += cross(cross_points[k] - cross_points[0],
cross_points[k + 1] - cross_points[0]);
}
return fabs(area) / 2.0;
}
__device__ inline float iou_bev(const float *box_a, const float *box_b){
__device__ inline float iou_bev(const float *box_a, const float *box_b) {
// params: box_a (5) [x1, y1, x2, y2, angle]
// params: box_b (5) [x1, y1, x2, y2, angle]
float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
......@@ -214,54 +241,65 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b){
return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
}
__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_overlap) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
if (a_idx >= num_a || b_idx >= num_b){
if (a_idx >= num_a || b_idx >= num_b) {
return;
}
const float * cur_box_a = boxes_a + a_idx * 5;
const float * cur_box_b = boxes_b + b_idx * 5;
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float s_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = s_overlap;
}
__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_iou) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
if (a_idx >= num_a || b_idx >= num_b){
if (a_idx >= num_a || b_idx >= num_b) {
return;
}
const float * cur_box_a = boxes_a + a_idx * 5;
const float * cur_box_b = boxes_b + b_idx * 5;
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
}
__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
const float *boxes, unsigned long long *mask){
//params: boxes (N, 5) [x1, y1, x2, y2, ry]
//params: mask (N, N/THREADS_PER_BLOCK_NMS)
const float *boxes, unsigned long long *mask) {
// params: boxes (N, 5) [x1, y1, x2, y2, ry]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
......@@ -276,7 +314,7 @@ __global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh){
if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
......@@ -285,8 +323,7 @@ __global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
}
}
__device__ inline float iou_normal(float const * const a, float const * const b) {
__device__ inline float iou_normal(float const *const a, float const *const b) {
float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
......@@ -296,28 +333,36 @@ __device__ inline float iou_normal(float const * const a, float const * const b)
return interS / fmaxf(Sa + Sb - interS, EPS);
}
__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh,
const float *boxes, unsigned long long *mask){
//params: boxes (N, 5) [x1, y1, x2, y2, ry]
//params: mask (N, N/THREADS_PER_BLOCK_NMS)
__global__ void nms_normal_kernel(const int boxes_num,
const float nms_overlap_thresh,
const float *boxes,
unsigned long long *mask) {
// params: boxes (N, 5) [x1, y1, x2, y2, ry]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
......@@ -332,7 +377,7 @@ __global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_t
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh){
if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
......@@ -341,41 +386,45 @@ __global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_t
}
}
void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
void boxesoverlapLauncher(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_overlap) {
dim3 blocks(
DIVUP(num_b, THREADS_PER_BLOCK),
DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_overlap);
boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
#ifdef DEBUG
cudaDeviceSynchronize(); // for using printf in kernel function
#endif
}
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
const float *boxes_b, float *ans_iou) {
dim3 blocks(
DIVUP(num_b, THREADS_PER_BLOCK),
DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_iou);
boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
ans_iou);
}
void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
float nms_overlap_thresh) {
dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
}
void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS);
nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes,
mask);
}
......@@ -135,13 +135,13 @@ void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num,
int *box_idx_of_points) {
// params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is
// the bottom center, each box params pts: (B, npoints, 3) [x, y, z] in
//LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
// LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
cudaError_t err;
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK);
points_in_boxes_batch_kernel<<<blocks, threads>>>(batch_size, boxes_num, pts_num,
boxes, pts, box_idx_of_points);
points_in_boxes_batch_kernel<<<blocks, threads>>>(
batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
......
......@@ -18,13 +18,19 @@
#include <vector>
namespace detail {
template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
template <class T>
int getTotalSize(std::vector<T> arg) {
return arg.size();
}
template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
template <typename T>
int getSize(std::vector<T> arg) {
return arg.size();
}
template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
......
......@@ -22,105 +22,98 @@
#include <utility>
#include <valarray>
namespace pretty_print
{
namespace detail
{
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base
{
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
};
template <typename T>
struct has_const_iterator : private sfinae_base
{
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C> static yes & test(typename C::const_iterator*);
template <typename C> static no & test(...);
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
};
template <typename T>
struct has_begin_end : private sfinae_base
{
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes & f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator(C::*)() const>(&C::begin)),
typename C::const_iterator(C::*)() const>::value>::type *);
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C> static no & f(...);
template <typename C>
static no &f(...);
template <typename C>
static yes & g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator(C::*)() const>(&C::end)),
typename C::const_iterator(C::*)() const>::value, void>::type*);
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C> static no & g(...);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
};
} // namespace detail
// Holds the delimiter values for a specific character type
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values
{
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type * prefix;
const char_type * delimiter;
const char_type * postfix;
};
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters
{
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
};
// Functor to print containers. You can use this directly if you want
// to specificy a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
// Functor to print containers. You can use this directly if you want
// to specificy a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T,
typename TChar = char,
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper
{
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer
{
static void print_body(const U & c, ostream_type & stream)
{
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end)
{
for ( ; ; )
{
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
......@@ -132,12 +125,9 @@ namespace pretty_print
}
};
print_container_helper(const T & container)
: container_(container)
{ }
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type & stream) const
{
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
......@@ -148,298 +138,356 @@ namespace pretty_print
}
private:
const T & container_;
};
const T &container_;
};
// Specialization for pairs
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits, typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits, TDelimiters>::printer<std::pair<T1, T2>>
{
using ostream_type = typename print_container_helper<T, TChar, TCharTraits, TDelimiters>::ostream_type;
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> & c, ostream_type & stream)
{
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits, TDelimiters>::delimiters_type::values.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits, TDelimiters>::delimiters_type::values.delimiter;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits, typename TDelimiters>
template <typename ...Args>
struct print_container_helper<T, TChar, TCharTraits, TDelimiters>::printer<std::tuple<Args...>>
{
using ostream_type = typename print_container_helper<T, TChar, TCharTraits, TDelimiters>::ostream_type;
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I> struct Int { };
template <std::size_t I>
struct Int {};
static void print_body(const element_type & c, ostream_type & stream)
{
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &, Int<sizeof...(Args)>)
{
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(const element_type & c, ostream_type & stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>, std::nullptr_t>::type)
{
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type & c, ostream_type & stream, Int<N>)
{
if (print_container_helper<T, TChar, TCharTraits, TDelimiters>::delimiters_type::values.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits, TDelimiters>::delimiters_type::values.delimiter;
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
};
// Prints a print_container_helper to the specified stream.
// Prints a print_container_helper to the specified stream.
template<typename T, typename TChar, typename TCharTraits, typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> & operator<<(
std::basic_ostream<TChar, TCharTraits> & stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> & helper)
{
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
}
// Basic is_container template; specialize to derive from std::true_type for all desired container types
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container : public std::integral_constant<bool,
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> { };
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type { };
template <std::size_t N>
struct is_container<char[N]> : std::false_type { };
template <typename T>
struct is_container<std::valarray<T>> : std::true_type { };
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type { };
template <typename ...Args>
struct is_container<std::tuple<Args...>> : std::true_type { };
// Default delimiters
template <typename T> struct delimiters<T, char> { static const delimiters_values<char> values; };
template <typename T> const delimiters_values<char> delimiters<T, char>::values = { "[", ", ", "]" };
template <typename T> struct delimiters<T, wchar_t> { static const delimiters_values<wchar_t> values; };
template <typename T> const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = { L"[", L", ", L"]" };
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::set<T, TComp, TAllocator>, char> { static const delimiters_values<char> values; };
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char> delimiters< ::std::set<T, TComp, TAllocator>, char>::values = { "{", ", ", "}" };
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::set<T, TComp, TAllocator>, wchar_t> { static const delimiters_values<wchar_t> values; };
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t> delimiters< ::std::set<T, TComp, TAllocator>, wchar_t>::values = { L"{", L", ", L"}" };
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::multiset<T, TComp, TAllocator>, char> { static const delimiters_values<char> values; };
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char> delimiters< ::std::multiset<T, TComp, TAllocator>, char>::values = { "{", ", ", "}" };
template <typename T, typename TComp, typename TAllocator>
struct delimiters< ::std::multiset<T, TComp, TAllocator>, wchar_t> { static const delimiters_values<wchar_t> values; };
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t> delimiters< ::std::multiset<T, TComp, TAllocator>, wchar_t>::values = { L"{", L", ", L"}" };
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>, char> { static const delimiters_values<char> values; };
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = { "{", ", ", "}" };
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> { static const delimiters_values<wchar_t> values; };
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters< ::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = { L"{", L", ", L"}" };
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>, char> { static const delimiters_values<char> values; };
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = { "{", ", ", "}" };
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>, wchar_t> { static const delimiters_values<wchar_t> values; };
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters< ::std::unordered_multiset<T, THash, TEqual, TAllocator>, wchar_t>::values = { L"{", L", ", L"}" };
// Delimiters for pair and tuple
template <typename T1, typename T2> struct delimiters<std::pair<T1, T2>, char> { static const delimiters_values<char> values; };
template <typename T1, typename T2> const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = { "(", ", ", ")" };
template <typename T1, typename T2> struct delimiters< ::std::pair<T1, T2>, wchar_t> { static const delimiters_values<wchar_t> values; };
template <typename T1, typename T2> const delimiters_values<wchar_t> delimiters< ::std::pair<T1, T2>, wchar_t>::values = { L"(", L", ", L")" };
template <typename ...Args> struct delimiters<std::tuple<Args...>, char> { static const delimiters_values<char> values; };
template <typename ...Args> const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = { "(", ", ", ")" };
template <typename ...Args> struct delimiters< ::std::tuple<Args...>, wchar_t> { static const delimiters_values<wchar_t> values; };
template <typename ...Args> const delimiters_values<wchar_t> delimiters< ::std::tuple<Args...>, wchar_t>::values = { L"(", L", ", L")" };
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t, and MyDelims needs to be defined for TChar.
// Usage: "cout << pretty_print::custom_delims<MyDelims>(x)".
struct custom_delims_base
{
virtual ~custom_delims_base() { }
virtual std::ostream & stream(::std::ostream &) = 0;
virtual std::wostream & stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base
{
custom_delims_wrapper(const T & t_) : t(t_) { }
std::ostream & stream(std::ostream & s)
{
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(t);
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream & stream(std::wostream & s)
{
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>, Delims>(t);
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T & t;
};
const T &t;
};
template <typename Delims>
struct custom_delims
{
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container & c) : base(new custom_delims_wrapper<Container, Delims>(c)) { }
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> & operator<<(std::basic_ostream<TChar, TCharTraits> & s, const custom_delims<Delims> & p)
{
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template<typename T>
struct array_wrapper_n
{
typedef const T * const_iterator;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T * const a, size_t n) : _array(a), _n(n) { }
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T * const _array;
const T *const _array;
size_t _n;
};
};
// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket
// 5 of container m.)
// A wrapper for hash-table based containers that offer local iterators to each bucket.
// Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket 5 of container m.)
template <typename T>
struct bucket_print_wrapper
{
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const
{
return m_map.cbegin(n);
}
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const
{
return m_map.cend(n);
}
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T & m, size_type bucket) : m_map(m), n(bucket) { }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T & m_map;
const T &m_map;
const size_type n;
};
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template<typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T * const a, size_t n)
{
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T> pretty_print::bucket_print_wrapper<T>
bucket_print(const T & m, typename T::size_type n)
{
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std
{
// Prints a container to the stream using default delimiters
namespace std {
// Prints a container to the stream using default delimiters
template<typename T, typename TChar, typename TCharTraits>
inline typename enable_if< ::pretty_print::is_container<T>::value,
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> & stream, const T & container)
{
return stream << ::pretty_print::print_container_helper<T, TChar, TCharTraits>(container);
}
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
......@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BOX_IOU_H
#define BOX_IOU_H
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
#include <pybind11/numpy.h>
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
namespace spconv {
// #include "voxelnet/core/cc/pybind11_helper.h"
......@@ -40,9 +40,10 @@ inline py::array_t<DType> zeros(std::vector<long int> shape) {
}
template <typename DType>
py::array_t<DType>
rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou, DType standup_thresh) {
py::array_t<DType> rbbox_iou(py::array_t<DType> box_corners,
py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou,
DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
......@@ -61,8 +62,7 @@ rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
if (standup_iou_r(n, k) <= standup_thresh) continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
......@@ -99,9 +99,10 @@ rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
}
template <typename DType>
py::array_t<DType>
rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou, DType standup_thresh) {
py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou,
DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
......@@ -120,8 +121,7 @@ rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corne
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
if (standup_iou_r(n, k) <= standup_thresh) continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
......@@ -152,6 +152,5 @@ rbbox_intersection(py::array_t<DType> box_corners, py::array_t<DType> qbox_corne
return overlaps;
}
} // namespace spconv
#endif
......@@ -15,9 +15,10 @@
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <tensorview/tensorview.h>
#include <iostream>
#include <limits>
#include <tensorview/tensorview.h>
namespace spconv {
template <typename Index, unsigned NDim>
......@@ -70,8 +71,7 @@ TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
......@@ -128,8 +128,7 @@ TV_HOST_DEVICE Index getValidOutPosTranspose(
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
......@@ -167,7 +166,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
......@@ -218,7 +217,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
......@@ -252,7 +251,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation, const Index *const outSpatialShape) {
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
......@@ -269,7 +269,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data();
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
......
......@@ -14,9 +14,9 @@
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <tensorview/tensorview.h>
#include <tensorview/helper_kernel.cu.h>
#include <spconv/geometry.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim,
......@@ -115,7 +115,6 @@ __global__ void assignGridAndIndiceOutKernel(
int numAct, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
Index index;
auto indicesOutPtr = indicesOut.data();
for (int ix : tv::KernelLoopX<int>(numAct)) {
......@@ -128,13 +127,11 @@ __global__ void assignGridAndIndiceOutKernel(
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, int numActIn,
tv::TensorView<Index> indicePairs,
__global__ void assignIndicePairsKernel(
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
int numActIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
Index index;
int kernelVolume = indicePairs.dim(0);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
......@@ -148,9 +145,8 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
__global__ void prepareSubMGridKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
......@@ -216,10 +212,9 @@ __global__ void resetGridKernel(const Index *indicePairUnique,
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape,
int numAct) {
__global__ void resetGridSubMKernel(
const Index *indices, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
int outSpatialShapeReg[NDim];
for (int i = 0; i < NDim; ++i) {
outSpatialShapeReg[i] = outSpatialShape[i];
......
......@@ -16,62 +16,63 @@
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv
{
namespace functor
{
namespace spconv {
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1
{
Index operator()(
const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
struct CreateConvIndicePairFunctorP1 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose);
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2
{
Index operator()(
const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
struct CreateConvIndicePairFunctorP2 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose,
bool resetGrid=false);
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor
{
Index operator()(
const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
struct CreateConvIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false);
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor
{
Index operator()(
const Device& d, tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
struct CreateSubMIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false);
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
} // namespace functor
} // namespace spconv
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment