Commit a7eb7dbe authored by wangchao1

update voxelization: vectorize the dynamic voxelization and duplicate-search kernels using 64-bit voxel keys, and move voxel-id assignment to the CPU

parent 7374ed8c
@@ -10,6 +10,7 @@
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
const T* points, T_int* coors, const float voxel_x, const float voxel_y,
@@ -49,8 +50,55 @@ __global__ void dynamic_voxelize_kernel(
}
}
template <typename T, typename T_int>
__global__ void __launch_bounds__(1024) dynamic_voxelize_kernel_fast(
    const T* points, T_int* coors, int64_t* coors64, const float voxel_x, const float voxel_y,
const float voxel_z, const float coors_x_min, const float coors_y_min,
const float coors_z_min, const float coors_x_max, const float coors_y_max,
const float coors_z_max, const int grid_x, const int grid_y,
const int grid_z, const int num_points, const int num_features,
const int NDim) {
CUDA_1D_KERNEL_LOOP(index, num_points) {
// To save some computation
auto points_offset = points + index * num_features;
auto coors_offset = coors + index * NDim;
int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x);
if (c_x < 0 || c_x >= grid_x) {
coors_offset[0] = -1;
coors64[index] = -1;
continue;
}
int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y);
if (c_y < 0 || c_y >= grid_y) {
coors_offset[0] = -1;
coors_offset[1] = -1;
coors64[index] = -1;
continue;
}
int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z);
if (c_z < 0 || c_z >= grid_z) {
coors_offset[0] = -1;
coors_offset[1] = -1;
coors_offset[2] = -1;
coors64[index] = -1;
} else {
coors_offset[0] = c_z;
coors_offset[1] = c_y;
coors_offset[2] = c_x;
      // 64-bit linear voxel key with x slowest and z fastest; cast the grid
      // product to int64_t so large grids do not overflow 32-bit arithmetic.
      coors64[index] = ((int64_t)c_x) * ((int64_t)grid_z * grid_y) +
                       ((int64_t)c_y) * grid_z + c_z;
}
}
}
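// Editor's sketch (not part of this commit): the 64-bit key written above
// linearizes (c_x, c_y, c_z) with x slowest and z fastest. A matching decode
// helper, useful when debugging voxel collisions, could look like this;
// `DecodedCoor` and `decode_voxel_key` are hypothetical names, not symbols
// from this file.
struct DecodedCoor {
  int c_x, c_y, c_z;
};
__host__ __device__ inline DecodedCoor decode_voxel_key(const int64_t key,
                                                        const int grid_y,
                                                        const int grid_z) {
  DecodedCoor out;
  out.c_z = static_cast<int>(key % grid_z);                      // fastest axis
  out.c_y = static_cast<int>((key / grid_z) % grid_y);
  out.c_x = static_cast<int>(key / ((int64_t)grid_z * grid_y));  // slowest axis
  return out;
}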
template <typename T, typename T_int>
-__global__ void assign_point_to_voxel(const int nthreads, const T* points,
+__global__ void __launch_bounds__(1024) assign_point_to_voxel(const int nthreads, const T* points,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T* voxels,
const int max_points,
@@ -73,7 +121,7 @@ __global__ void assign_point_to_voxel(const int nthreads, const T* points,
}
template <typename T, typename T_int>
-__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
+__global__ void __launch_bounds__(1024) assign_voxel_coors(const int nthreads, T_int* coor,
T_int* point_to_voxelidx,
T_int* coor_to_voxelidx, T_int* voxel_coors,
const int num_points, const int NDim) {
@@ -135,6 +183,66 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
}
}
template <typename T_int, int VEC>
__global__ void __launch_bounds__(1024) point_to_voxelidx_kernel_fast(
    const int64_t* coor, T_int* point_to_voxelidx, T_int* point_to_pointidx,
    const int max_points, const int max_voxels, const int num_points) {
  using Int64VEC = __attribute__((__vector_size__(VEC * sizeof(int64_t)))) int64_t;
  int tid = threadIdx.x;
  int index = (blockIdx.x * blockDim.x + tid) * VEC;
  // The bounds check must come before the vector load below; otherwise the
  // last threads read past the end of coor.
  if (index >= num_points) return;
  auto i_coor = *reinterpret_cast<const Int64VEC*>(coor + index);
  // num[m] counts how many earlier points share the voxel of lane m.
  int num[VEC];
  for (int i = 0; i < VEC; i++) {
    num[i] = 0;
  }
  // Scan all earlier VEC-wide chunks for points in the same voxel.
  for (int i = 0; i < index; i += VEC) {
    auto prev_coor = *reinterpret_cast<const Int64VEC*>(coor + i);
    for (int k = 0; k < VEC; k++) {
      if (prev_coor[k] == -1) continue;
      for (int m = 0; m < VEC; m++) {
        if (prev_coor[k] == i_coor[m]) {
          num[m]++;
          // The first match is the point that owns the voxel.
          if (num[m] == 1) {
            point_to_pointidx[index + m] = i + k;
          }
        }
      }
    }
  }
  // Resolve duplicates inside this thread's own chunk, where lane k
  // precedes lane m whenever k < m.
  for (int k = 0; k < VEC - 1; k++) {
    if (i_coor[k] == -1) continue;
    for (int m = k + 1; m < VEC; m++) {
      if (i_coor[k] == i_coor[m]) {
        num[m]++;
        if (num[m] == 1) {
          point_to_pointidx[index + m] = index + k;
        }
      }
    }
  }
  for (int k = 0; k < VEC; k++) {
    if (i_coor[k] == -1) continue;
    // A point with no earlier duplicate owns its voxel.
    if (num[k] == 0) {
      point_to_pointidx[index + k] = index + k;
    }
    // Points beyond max_points per voxel keep their default -1 slot.
    if (num[k] < max_points) {
      point_to_voxelidx[index + k] = num[k];
    }
  }
}
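// Editor's sketch (not part of this commit): a scalar reference for the
// vectorized kernel above, spelling out the semantics each VEC lane
// reproduces. For every point, count how many earlier points share its
// 64-bit voxel key; the first occurrence becomes the voxel owner recorded in
// point_to_pointidx, and points past max_points keep their default -1 slot.
// `point_to_voxelidx_reference` is a hypothetical name.
template <typename T_int>
void point_to_voxelidx_reference(const int64_t* coor, T_int* point_to_voxelidx,
                                 T_int* point_to_pointidx,
                                 const int max_points, const int num_points) {
  for (int index = 0; index < num_points; ++index) {
    if (coor[index] == -1) continue;  // point fell outside the grid
    int num = 0;
    for (int i = 0; i < index; ++i) {
      if (coor[i] == coor[index]) {
        ++num;
        if (num == 1) point_to_pointidx[index] = i;  // first point owns voxel
      }
    }
    if (num == 0) point_to_pointidx[index] = index;  // index owns its voxel
    if (num < max_points) point_to_voxelidx[index] = num;
  }
}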
template <typename T_int>
__global__ void determin_voxel_num(
// const T_int* coor,
@@ -166,6 +274,31 @@ __global__ void determin_voxel_num(
}
}
// Sequential host-side pass: assign a voxel id to the first point of every
// voxel and count how many points each voxel receives.
template <typename T_int>
void determin_voxel_num_cpu(T_int* num_points_per_voxel, T_int* point_to_voxelidx,
                            T_int* point_to_pointidx, T_int* coor_to_voxelidx,
                            T_int* voxel_num, const int max_points,
                            const int max_voxels, const int num_points) {
  for (int i = 0; i < num_points; ++i) {
    int point_pos_in_voxel = point_to_voxelidx[i];
    if (point_pos_in_voxel == -1) {
      // Point is out of range or beyond max_points for its voxel.
      continue;
    } else if (point_pos_in_voxel == 0) {
      // First point of a voxel: allocate a new voxel id, if any is left.
      if (voxel_num[0] >= max_voxels) continue;
      int voxelidx = voxel_num[0];
      voxel_num[0] += 1;
      coor_to_voxelidx[i] = voxelidx;
      num_points_per_voxel[voxelidx] = 1;
    } else {
      // Follower point: inherit the voxel id of the voxel's first point.
      int point_idx = point_to_pointidx[i];
      int voxelidx = coor_to_voxelidx[point_idx];
      if (voxelidx != -1) {
        coor_to_voxelidx[i] = voxelidx;
        num_points_per_voxel[voxelidx] += 1;
      }
    }
  }
}
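// Editor's sketch (not part of this commit): a minimal host-side example of
// determin_voxel_num_cpu on hand-made inputs. The arrays below are made up
// for illustration: point 1 duplicates point 0's voxel, point 2 opens a new
// voxel.
inline void determin_voxel_num_cpu_example() {
  int point_to_voxelidx[3] = {0, 1, 0};  // slot of each point in its voxel
  int point_to_pointidx[3] = {0, 0, 2};  // owner point of each point's voxel
  int coor_to_voxelidx[3] = {-1, -1, -1};
  int num_points_per_voxel[3] = {0, 0, 0};
  int voxel_num[1] = {0};
  determin_voxel_num_cpu<int>(num_points_per_voxel, point_to_voxelidx,
                              point_to_pointidx, coor_to_voxelidx, voxel_num,
                              /*max_points=*/2, /*max_voxels=*/3,
                              /*num_points=*/3);
  // Expected: voxel_num[0] == 2, coor_to_voxelidx == {0, 0, 1},
  // num_points_per_voxel == {2, 1, 0}.
}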
__global__ void nondeterministic_get_assign_pos(
const int nthreads, const int32_t* coors_map, int32_t* pts_id,
int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) {
@@ -36,19 +36,22 @@ int HardVoxelizeForwardCUDAKernelLauncher(
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
at::Tensor temp_coors64 =
at::zeros({num_points}, points.options().dtype(at::kLong));
dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
dim3 block(512);
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
-        dynamic_voxelize_kernel<scalar_t, int><<<grid, block, 0, stream>>>(
+        dynamic_voxelize_kernel_fast<scalar_t, int>
+            <<<grid, block, 0, stream>>>(
                points.contiguous().data_ptr<scalar_t>(),
-               temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
-               coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
-               coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,
-               NDim);
+               temp_coors.contiguous().data_ptr<int>(),
+               temp_coors64.data_ptr<int64_t>(), voxel_x, voxel_y,
+               voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
+               coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
+               num_features, NDim);
}));
AT_CUDA_CHECK(cudaGetLastError());
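  // Editor's note (assumption, not in this commit): the VEC-wide loads in
  // point_to_voxelidx_kernel_fast read temp_coors64 in chunks of VEC int64
  // values (VEC is defined below, 2 in this commit), so if num_points is not
  // a multiple of VEC the last chunk reads past the allocation. One
  // hypothetical fix is to pad the buffer with -1, which the kernel already
  // treats as "no voxel":
  //   int num_points_padded = (num_points + VEC - 1) / VEC * VEC;
  //   at::Tensor temp_coors64 = at::full(
  //       {num_points_padded}, -1, points.options().dtype(at::kLong));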
@@ -66,45 +69,58 @@ int HardVoxelizeForwardCUDAKernelLauncher(
},
points.options().dtype(at::kInt));
-  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
-  dim3 map_block(512);
+  int blocksize = 256;
+  constexpr int VEC = 2;
+  dim3 map_grid(at::cuda::ATenCeilDiv(num_points, blocksize * VEC));
+  dim3 map_block(blocksize);
AT_DISPATCH_ALL_TYPES(
temp_coors.scalar_type(), "determin_duplicate", ([&] {
-        point_to_voxelidx_kernel<int><<<map_grid, map_block, 0, stream>>>(
-            temp_coors.contiguous().data_ptr<int>(),
+        point_to_voxelidx_kernel_fast<int, VEC>
+            <<<map_grid, map_block, 0, stream>>>(
+                temp_coors64.contiguous().data_ptr<int64_t>(),
                point_to_voxelidx.contiguous().data_ptr<int>(),
                point_to_pointidx.contiguous().data_ptr<int>(), max_points,
-                max_voxels, num_points, NDim);
+                max_voxels, num_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
// 3. determine voxel num and voxel's coor index
  // this pass previously ran as a single-thread CUDA kernel; it now runs on the CPU
-  auto coor_to_voxelidx = -at::ones(
+  auto coor_to_voxelidx_cpu = -at::ones(
      {
          num_points,
      },
-      points.options().dtype(at::kInt));
+      // on the CPU so determin_voxel_num_cpu can write into it directly
+      points.options().device(at::kCPU).dtype(at::kInt));
-  auto voxel_num = at::zeros(
+  auto voxel_num_cpu = at::zeros(
      {
          1,
      },
-      points.options().dtype(at::kInt));  // must be zero from the beginning
+      points.options().device(at::kCPU).dtype(at::kInt));
-  AT_DISPATCH_ALL_TYPES(temp_coors.scalar_type(), "determin_duplicate", ([&] {
-                          determin_voxel_num<int><<<1, 1, 0, stream>>>(
-                              num_points_per_voxel.contiguous().data_ptr<int>(),
-                              point_to_voxelidx.contiguous().data_ptr<int>(),
-                              point_to_pointidx.contiguous().data_ptr<int>(),
-                              coor_to_voxelidx.contiguous().data_ptr<int>(),
-                              voxel_num.contiguous().data_ptr<int>(),
-                              max_points, max_voxels, num_points);
+  auto point_to_voxelidx_cpu = point_to_voxelidx.to(at::kCPU);
+  auto point_to_pointidx_cpu = point_to_pointidx.to(at::kCPU);
+  auto num_points_per_voxel_cpu = num_points_per_voxel.to(at::kCPU);
+  AT_DISPATCH_ALL_TYPES(
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
+        determin_voxel_num_cpu<int>(
+            num_points_per_voxel_cpu.contiguous().data_ptr<int>(),
+            point_to_voxelidx_cpu.contiguous().data_ptr<int>(),
+            point_to_pointidx_cpu.contiguous().data_ptr<int>(),
+            coor_to_voxelidx_cpu.contiguous().data_ptr<int>(),
+            voxel_num_cpu.contiguous().data_ptr<int>(), max_points, max_voxels,
+            num_points);
}));
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
  auto coor_to_voxelidx = coor_to_voxelidx_cpu.to(at::kCUDA);
num_points_per_voxel.copy_(num_points_per_voxel_cpu);
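  // Editor's note (assumption, not in this commit): the CPU pass above costs
  // three device-to-host copies and two host-to-device copies per call. If
  // that round trip shows up in profiles, pinned host buffers with
  // non-blocking copies are one way to overlap the transfers, e.g.:
  //   auto cpu_opts =
  //       point_to_voxelidx.options().device(at::kCPU).pinned_memory(true);
  //   auto point_to_voxelidx_cpu = at::empty_like(point_to_voxelidx, cpu_opts);
  //   point_to_voxelidx_cpu.copy_(point_to_voxelidx, /*non_blocking=*/true);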
// 4. copy point features to voxels
// Step 4 & 5 could be parallel
auto pts_output_size = num_points * num_features;
@@ -139,7 +155,7 @@ int HardVoxelizeForwardCUDAKernelLauncher(
AT_CUDA_CHECK(cudaGetLastError());
-  auto voxel_num_cpu = voxel_num.to(at::kCPU);
+  // auto voxel_num_cpu = voxel_num.to(at::kCPU);
int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
return voxel_num_int;