Commit c328c870 authored by xmyqsh's avatar xmyqsh
Browse files

init GPU Voxelization

parent cfaa1a3a
#pragma once
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
template <typename Index, unsigned NDim>
__global__ void scatterPointToGridKernel(
tv::TensorView<const float> points,
tv::TensorView<const Index> indexes,
tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndexUnique,
const tv::SimpleVector<Index, NDim> gridShape) {
Index index;
int numPoints = points.dim(0);
int numFeatures = points.dim(1);
for (int ix : tv::KernelLoopX<int>(numPoints)) {
// slow here, atomic Add + random access
// Use ILP to speed up it
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indexes.data() + ix * NDim, gridShape.data(), 0);
pointIndexUnique(ix) = index;
atomicAdd(numPointsPerGrid.data() + index, Index(1));
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
atomicAdd(grids.data() + index * numFeatures + k, *(points.data() + ix * numFeatures + k));
}
}
}
template <typename Index, unsigned NDim>
__global__ void gatherPointFromGridKernel(
tv::TensorView<const float> grids,
tv::TensorView<const Index> numPointsPerGrid,
tv::TensorView<const Index> pointIndexUnique,
tv::TensorView<float> voxels,
tv::TensorView<Index> coors,
const tv::SimpleVector<Index, NDim> gridShape) {
Index index;
int numVoxels = voxels.dim(0);
int numFeatures = grids.dim(1);
for (int ix : tv::KernelLoopX<int>(numVoxels)) {
// slow here, random access
// Use ILP to speed up it
index = pointIndexUnique(ix);
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
}
index = tv::rowArrayIdxInv<Index, NDim>(
index, coors.data() + ix * NDim, gridShape.data());
}
}
} // namespace spconv
// Copyright 2020 xmyqsh
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <spconv/points2voxels.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
std::vector<torch::Tensor>
pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
std::vector<int64_t> gridShape,
const int64_t ndim,
const int64_t gridVolume);
} // namespace spconv
#pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
void scatter_point_to_grid_cuda(
torch::Tensor points,
torch::Tensor indexes,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndexUnique,
std::vector<int64_t> gridShape,
const int ndim);
void gather_point_from_grid_cuda(
torch::Tensor grids, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors,
std::vector<int64_t> gridShape,
const int ndim);
} // namespace spconv
...@@ -157,3 +157,36 @@ def pillar_scatter(features, coors, shape): ...@@ -157,3 +157,36 @@ def pillar_scatter(features, coors, shape):
return torch.ops.spconv.pillar_scatter_half(features, coors, shape) return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
else: else:
raise NotImplementedError raise NotImplementedError
def points_to_voxel(points, voxel_size, coors_range):
"""
points: [N, ndim] float tensor. points[:, :3] contain xyz points and
points[:, 3:] contain other information such as reflectivity.
voxel_size: [3] list/tuple or array or tensor, float. xyz, indicate voxel size
coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
format: xyzxyz, minmax
"""
if not isinstance(voxel_size, torch.Tensor):
if not isinstance(voxel_size, np.ndarray):
voxel_size = np.array(voxel_size, dtype=points.dtype)
voxel_size = torch.from_numpy(voxel_size).to(points.device)
if not isinstance(voxel_size, torch.Tensor):
if not isinstance(coors_range, np.ndarray):
coors_range = np.array(coors_range, dtype=points.dtype)
coors_range = torch.from_numpy(coors_range).to(points.device)
grid_shape = torch.round((coors_range[3:] - coors_range[:3]) / voxel_size).to(torch.int32)
grid_volume = grid_shape.prod()
ndim = grid_shape.shape[0]
# indexes = torch.round((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
indexes = torch.floor((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
voxels, coors = torch.ops.spconv.points_to_voxel(points, indexes, grid_shape.cpu().numpy().tolist(), ndim, grid_volume.item())
# xyz --> zyx
#coors = coors[::-1]
x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
coors = torch.cat([z, y, x], dim=1)
# can be skipped
x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
voxels = torch.cat([z, y, x, f], dim=1)
return voxels, coors
...@@ -13,8 +13,10 @@ ...@@ -13,8 +13,10 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import torch
from spconv import spconv_utils from spconv import spconv_utils
from spconv.ops import points_to_voxel as points_to_voxel3
from spconv.spconv_utils import (non_max_suppression_cpu, from spconv.spconv_utils import (non_max_suppression_cpu,
points_to_voxel_3d_np, points_to_voxel_3d_np,
points_to_voxel_3d_np_mean, points_to_voxel_3d_np_mean,
...@@ -292,3 +294,33 @@ class VoxelGeneratorV2: ...@@ -292,3 +294,33 @@ class VoxelGeneratorV2:
@property @property
def grid_size(self): def grid_size(self):
return self._grid_size return self._grid_size
class VoxelGeneratorV3:
def __init__(self,
voxel_size,
point_cloud_range):
self._point_cloud_range = point_cloud_range
self._voxel_size = voxel_size
self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
self._grid_size = self._grid_size.cpu().numpy().tolist()
def generate(self, points):
res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range)
return res
def generate_multi_gpu(self, points, max_voxels=None):
res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range)
return res
@property
def voxel_size(self):
return self._voxel_size
@property
def point_cloud_range(self):
return self._point_cloud_range
@property
def grid_size(self):
return self._grid_size
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc) set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc point2voxel_ops.cc)
if (SPCONV_BuildCUDA) if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc fused_conv.cu) set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc point2voxel.cu fused_conv.cu)
endif() endif()
add_library(spconv SHARED ${ALL_FILES}) add_library(spconv SHARED ${ALL_FILES})
......
...@@ -15,12 +15,14 @@ ...@@ -15,12 +15,14 @@
#include <spconv/fused_spconv_ops.h> #include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h> #include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h> #include <spconv/pillar_scatter_ops.h>
#include <spconv/point2voxel_ops.h>
#include <spconv/pool_ops.h> #include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
#include <torch/script.h> #include <torch/script.h>
static auto registry = static auto registry =
torch::RegisterOperators() torch::RegisterOperators()
.op("spconv::points_to_voxel", &spconv::pointsToVoxel)
.op("spconv::get_indice_pairs", &spconv::getIndicePairs) .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
.op("spconv::indice_conv", &spconv::indiceConv) .op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward) .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
......
#include <ATen/ATen.h>
#include <spconv/point2voxel.cu.h>
//#include <spconv/point2voxel.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
namespace spconv {
void scatter_point_to_grid_cuda(
torch::Tensor points,
torch::Tensor indexes,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndexUnique,
std::vector<int64_t> gridShape,
const int ndim) {
auto stream = at::cuda::getCurrentCUDAStream();
auto num_points = points.size(0);
auto num_features = points.size(1);
tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue);
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
scatterPointToGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<float>(points),
tv::torch2tv<Index>(indexes),
tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique),
gs);
TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, scatterPointToGridKernel<Index, NDim>));
tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim,
">", attr.numRegs);
#endif
});
});
}
void gather_point_from_grid_cuda(
torch::Tensor grids, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors,
std::vector<int64_t> gridShape,
const int ndim) {
auto stream = at::cuda::getCurrentCUDAStream();
auto num_voxel = voxels.size(0);
tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue);
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
gatherPointFromGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique),
tv::torch2tv<float>(voxels),
tv::torch2tv<Index>(coors),
gs);
TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
cudaFuncAttributes attr2;
#ifdef TV_LOG_KERNEL_INFO
checkCudaErrors(cudaFuncGetAttributes(
&attr2, gatherPointFromGridKernel<Index, NDim>));
tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim, ">",
attr2.numRegs);
#endif
});
});
}
} // namespace spconv
#include <spconv/point2voxel_ops.h>
//#include <spconv/point2voxel.cu.h>
namespace spconv {
std::vector<torch::Tensor>
pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
std::vector<int64_t> gridShape,
const int64_t ndim,
const int64_t gridVolume) {
auto device = points.device().type();
auto num_points = points.size(0);
auto num_features = points.size(1);
auto pointIndexUnique = torch::full(
{num_points + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(points.device()));
auto grids = torch::zeros({gridVolume, num_features},
torch::dtype(points.dtype()).device(points.device()));
auto numPointsPerGrid = torch::zeros({gridVolume},
torch::dtype(torch::kInt32).device(points.device()));
if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently");
}
#ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) {
scatter_point_to_grid_cuda(points, indexes, grids,
numPointsPerGrid, pointIndexUnique, gridShape, ndim);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
auto res = torch::_unique(pointIndexUnique);
pointIndexUnique = std::get<0>(res);
auto num_voxel = pointIndexUnique.size(0) - 1;
auto voxels = torch::zeros({num_voxel, num_features},
torch::dtype(points.dtype()).device(points.device()));
auto coors = torch::zeros({num_voxel, ndim},
torch::dtype(torch::kInt32).device(points.device()));
if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently");
}
#ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) {
gather_point_from_grid_cuda(grids, numPointsPerGrid,
pointIndexUnique, voxels, coors, gridShape, ndim);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
return {voxels, coors};
}
} // namespace spconv
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import spconv
from spconv.utils import VoxelGeneratorV3
def waymo_data(batch_size=1):
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
points = torch.from_numpy(data['pc']).cuda().float()
voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
gen = VoxelGeneratorV3(voxel_size, coors_range)
voxels, coors = gen.generate(points)
N = coors.shape[0]
batch_id = torch.zeros([N, 1], dtype=coors.dtype, device=coors.device)
coors = torch.cat([batch_id, coors], dim=1)
return voxels, coors, gen.grid_size
class Net(nn.Module):
def __init__(self, shape, algo, device):
super().__init__()
self.device = device
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo),
spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo),
spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo),
spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo),
spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo),
spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo),
spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo),
spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo),
)
max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self.grid = torch.full([max_batch_size, *shape], -1,
dtype=torch.int32, device=self.device)
# self.grid = None
self.shape = shape
def forward(self, features, coors, batch_size):
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
self.grid)
return self.net(x)
def main():
voxels, coors, spatial_shape = waymo_data()
voxels_th, coors_th = voxels, coors
algo = spconv.ConvAlgo.Native
net = Net(spatial_shape[::-1], algo, voxels_th.device).cuda(device=voxels_th.device).eval().float()
print(coors_th.shape)
out = net(voxels_th, coors_th, 1)
print(out.spatial_shape)
times = []
with torch.no_grad():
for i in range(20):
torch.cuda.synchronize()
t = time.time()
out = net(voxels_th, coors_th, 1)
torch.cuda.synchronize()
times.append(time.time() - t)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print("spconv time", np.mean(times[10:]))
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment