Commit c88efea2 authored by xmyqsh's avatar xmyqsh
Browse files

pre-alloc and re-use gpu memory

9ms --> 1ms on waymo open dataset
parent c328c870
...@@ -11,7 +11,7 @@ __global__ void scatterPointToGridKernel( ...@@ -11,7 +11,7 @@ __global__ void scatterPointToGridKernel(
tv::TensorView<const Index> indexes, tv::TensorView<const Index> indexes,
tv::TensorView<float> grids, tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid, tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndexUnique, tv::TensorView<Index> pointIndex,
const tv::SimpleVector<Index, NDim> gridShape) { const tv::SimpleVector<Index, NDim> gridShape) {
Index index; Index index;
int numPoints = points.dim(0); int numPoints = points.dim(0);
...@@ -22,7 +22,7 @@ __global__ void scatterPointToGridKernel( ...@@ -22,7 +22,7 @@ __global__ void scatterPointToGridKernel(
// Use ILP to speed up it // Use ILP to speed up it
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs( index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indexes.data() + ix * NDim, gridShape.data(), 0); indexes.data() + ix * NDim, gridShape.data(), 0);
pointIndexUnique(ix) = index; pointIndex(ix) = index;
atomicAdd(numPointsPerGrid.data() + index, Index(1)); atomicAdd(numPointsPerGrid.data() + index, Index(1));
#pragma unroll #pragma unroll
for (int k = 0; k != numFeatures; ++k) { for (int k = 0; k != numFeatures; ++k) {
...@@ -55,4 +55,35 @@ __global__ void gatherPointFromGridKernel( ...@@ -55,4 +55,35 @@ __global__ void gatherPointFromGridKernel(
index, coors.data() + ix * NDim, gridShape.data()); index, coors.data() + ix * NDim, gridShape.data());
} }
} }
template <typename Index>
__global__ void resetGridKernel(
tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndexUnique) {
Index index;
int numVoxels = pointIndexUnique.dim(0) - 1;
int numFeatures = grids.dim(1);
for (int ix : tv::KernelLoopX<int>(numVoxels)) {
// slow here, random access
// Use ILP to speed up it
index = pointIndexUnique(ix);
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
grids(index, k) = 0;
numPointsPerGrid(index) = 0;
}
}
}
template <typename Index>
__global__ void resetPointIndexKernel(
tv::TensorView<Index> pointIndex, const Index gridVolume) {
int num_max_points = pointIndex.dim(0) - 1;
for (int ix : tv::KernelLoopX<int>(num_max_points)) {
pointIndex(ix) = gridVolume;
}
}
} // namespace spconv } // namespace spconv
...@@ -21,10 +21,15 @@ ...@@ -21,10 +21,15 @@
namespace spconv { namespace spconv {
std::vector<torch::Tensor> int64_t
pointsToVoxel(torch::Tensor points, torch::Tensor indexes, pointsToVoxel(torch::Tensor points,
std::vector<int64_t> gridShape, torch::Tensor indexes,
const int64_t ndim, torch::Tensor pointIndex,
const int64_t gridVolume); torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor voxels,
torch::Tensor coors,
std::vector<int64_t> gridShape,
const int64_t ndim);
} // namespace spconv } // namespace spconv
...@@ -11,12 +11,13 @@ void scatter_point_to_grid_cuda( ...@@ -11,12 +11,13 @@ void scatter_point_to_grid_cuda(
torch::Tensor indexes, torch::Tensor indexes,
torch::Tensor grids, torch::Tensor grids,
torch::Tensor numPointsPerGrid, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndexUnique, torch::Tensor pointIndex,
std::vector<int64_t> gridShape, std::vector<int64_t> gridShape,
const int ndim); const int ndim);
void gather_point_from_grid_cuda( void gather_point_from_grid_cuda(
torch::Tensor grids, torch::Tensor numPointsPerGrid, torch::Tensor grids, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
torch::Tensor pointIndexUnique, torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors, torch::Tensor voxels, torch::Tensor coors,
std::vector<int64_t> gridShape, std::vector<int64_t> gridShape,
......
...@@ -157,36 +157,3 @@ def pillar_scatter(features, coors, shape): ...@@ -157,36 +157,3 @@ def pillar_scatter(features, coors, shape):
return torch.ops.spconv.pillar_scatter_half(features, coors, shape) return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
else: else:
raise NotImplementedError raise NotImplementedError
def points_to_voxel(points, voxel_size, coors_range):
"""
points: [N, ndim] float tensor. points[:, :3] contain xyz points and
points[:, 3:] contain other information such as reflectivity.
voxel_size: [3] list/tuple or array or tensor, float. xyz, indicate voxel size
coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
format: xyzxyz, minmax
"""
if not isinstance(voxel_size, torch.Tensor):
if not isinstance(voxel_size, np.ndarray):
voxel_size = np.array(voxel_size, dtype=points.dtype)
voxel_size = torch.from_numpy(voxel_size).to(points.device)
if not isinstance(voxel_size, torch.Tensor):
if not isinstance(coors_range, np.ndarray):
coors_range = np.array(coors_range, dtype=points.dtype)
coors_range = torch.from_numpy(coors_range).to(points.device)
grid_shape = torch.round((coors_range[3:] - coors_range[:3]) / voxel_size).to(torch.int32)
grid_volume = grid_shape.prod()
ndim = grid_shape.shape[0]
# indexes = torch.round((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
indexes = torch.floor((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
voxels, coors = torch.ops.spconv.points_to_voxel(points, indexes, grid_shape.cpu().numpy().tolist(), ndim, grid_volume.item())
# xyz --> zyx
#coors = coors[::-1]
x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
coors = torch.cat([z, y, x], dim=1)
# can be skipped
x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
voxels = torch.cat([z, y, x, f], dim=1)
return voxels, coors
...@@ -16,7 +16,6 @@ import numpy as np ...@@ -16,7 +16,6 @@ import numpy as np
import torch import torch
from spconv import spconv_utils from spconv import spconv_utils
from spconv.ops import points_to_voxel as points_to_voxel3
from spconv.spconv_utils import (non_max_suppression_cpu, from spconv.spconv_utils import (non_max_suppression_cpu,
points_to_voxel_3d_np, points_to_voxel_3d_np,
points_to_voxel_3d_np_mean, points_to_voxel_3d_np_mean,
...@@ -298,19 +297,39 @@ class VoxelGeneratorV2: ...@@ -298,19 +297,39 @@ class VoxelGeneratorV2:
class VoxelGeneratorV3: class VoxelGeneratorV3:
def __init__(self, def __init__(self,
voxel_size, voxel_size,
point_cloud_range): point_cloud_range,
max_points,
num_features,
dtype,
device):
self._max_points = max_points
self._point_cloud_range = point_cloud_range self._point_cloud_range = point_cloud_range
self._voxel_size = voxel_size self._voxel_size = voxel_size
self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32) self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
grid_volume = self._grid_size.prod()
self._grid_size = self._grid_size.cpu().numpy().tolist() self._grid_size = self._grid_size.cpu().numpy().tolist()
self._ndim = len(self._grid_size)
self._dtype = dtype
self._device = device
self._point_index = torch.full([max_points + 1], grid_volume, dtype=torch.int32, device=self._device)
self._grids = torch.zeros([grid_volume, num_features], dtype=self._dtype, device=self._device)
self._num_points_per_grid = torch.zeros([grid_volume], dtype=torch.int32, device=self._device)
self._voxels = torch.zeros([max_points, num_features], dtype=self._dtype, device=self._device)
self._coors = torch.zeros([max_points, self._ndim], dtype=torch.int32, device=self._device)
def generate(self, points): def generate(self, points):
res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range) assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0])
return res points.to(self._dtype).to(self._device)
return self.points_to_voxel(points)
def generate_multi_gpu(self, points, max_voxels=None): def generate_multi_gpu(self, points):
res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range) assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0])
return res points.to(self._dtype).to(self._device)
return self.points_to_voxel(points)
@property @property
def voxel_size(self): def voxel_size(self):
...@@ -324,3 +343,31 @@ class VoxelGeneratorV3: ...@@ -324,3 +343,31 @@ class VoxelGeneratorV3:
def grid_size(self): def grid_size(self):
return self._grid_size return self._grid_size
def points_to_voxel(self, points):
"""
points: [N, ndim] float tensor. points[:, :3] contain xyz points and
points[:, 3:] contain other information such as reflectivity.
voxel_size: [3] list/tuple or array or tensor, float. xyz, indicate voxel size
coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
format: xyzxyz, minmax
"""
indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
num_voxel = torch.ops.spconv.points_to_voxel(points, indexes,
self._point_index,
self._grids,
self._num_points_per_grid,
self._voxels,
self._coors,
self._grid_size,
self._ndim)
voxels = self._voxels[:num_voxel, :]
coors = self._coors[:num_voxel, :]
# xyz --> zyx
#coors = coors[::-1]
x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
coors = torch.cat([z, y, x], dim=1)
# can be skipped
# x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
# voxels = torch.cat([z, y, x, f], dim=1)
return voxels, coors
...@@ -15,13 +15,13 @@ void scatter_point_to_grid_cuda( ...@@ -15,13 +15,13 @@ void scatter_point_to_grid_cuda(
torch::Tensor indexes, torch::Tensor indexes,
torch::Tensor grids, torch::Tensor grids,
torch::Tensor numPointsPerGrid, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndexUnique, torch::Tensor pointIndex,
std::vector<int64_t> gridShape, std::vector<int64_t> gridShape,
const int ndim) { const int ndim) {
auto stream = at::cuda::getCurrentCUDAStream(); auto stream = at::cuda::getCurrentCUDAStream();
auto num_points = points.size(0); auto num_points = points.size(0);
auto num_features = points.size(1); auto num_features = points.size(1);
tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) { tv::dispatch_torch<int32_t>(pointIndex.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue); using Index = decltype(IndexValue);
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) { tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value; constexpr int NDim = decltype(I)::value;
...@@ -32,7 +32,7 @@ void scatter_point_to_grid_cuda( ...@@ -32,7 +32,7 @@ void scatter_point_to_grid_cuda(
tv::torch2tv<Index>(indexes), tv::torch2tv<Index>(indexes),
tv::torch2tv<float>(grids), tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid), tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique), tv::torch2tv<Index>(pointIndex),
gs); gs);
TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed"); TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
...@@ -48,17 +48,33 @@ void scatter_point_to_grid_cuda( ...@@ -48,17 +48,33 @@ void scatter_point_to_grid_cuda(
void gather_point_from_grid_cuda( void gather_point_from_grid_cuda(
torch::Tensor grids, torch::Tensor numPointsPerGrid, torch::Tensor grids, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
torch::Tensor pointIndexUnique, torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors, torch::Tensor voxels, torch::Tensor coors,
std::vector<int64_t> gridShape, std::vector<int64_t> gridShape,
const int ndim) { const int ndim) {
auto stream = at::cuda::getCurrentCUDAStream(); auto stream = at::cuda::getCurrentCUDAStream();
auto num_voxel = voxels.size(0); auto num_voxel = voxels.size(0);
auto num_max_points = pointIndex.size(0) - 1;
auto grid_volume = grids.size(0);
tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) { tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue); using Index = decltype(IndexValue);
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) { tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value; constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end()); tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
resetPointIndexKernel<Index>
<<<tv::cuda::getBlocks(num_max_points), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(pointIndex), grid_volume);
TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr0;
checkCudaErrors(cudaFuncGetAttributes(
&attr0, resetPointIndexKernel<Index, NDim>));
tv::ssprint("resetPointIndexKernel<", tv::type_s<Index>, NDim, ">",
attr0.numRegs);
#endif
gatherPointFromGridKernel<Index, NDim> gatherPointFromGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<float>(grids), 0, stream>>>(tv::torch2tv<float>(grids),
...@@ -68,11 +84,25 @@ void gather_point_from_grid_cuda( ...@@ -68,11 +84,25 @@ void gather_point_from_grid_cuda(
tv::torch2tv<Index>(coors), tv::torch2tv<Index>(coors),
gs); gs);
TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed"); TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
cudaFuncAttributes attr2;
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr1;
checkCudaErrors(cudaFuncGetAttributes( checkCudaErrors(cudaFuncGetAttributes(
&attr2, gatherPointFromGridKernel<Index, NDim>)); &attr1, gatherPointFromGridKernel<Index, NDim>));
tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim, ">", tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim, ">",
attr1.numRegs);
#endif
resetGridKernel<Index>
<<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique));
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr2;
checkCudaErrors(cudaFuncGetAttributes(
&attr2, resetGridKernel<Index, NDim>));
tv::ssprint("resetGridKernel<", tv::type_s<Index>, NDim, ">",
attr2.numRegs); attr2.numRegs);
#endif #endif
}); });
......
...@@ -3,53 +3,44 @@ ...@@ -3,53 +3,44 @@
namespace spconv { namespace spconv {
std::vector<torch::Tensor> int64_t
pointsToVoxel(torch::Tensor points, torch::Tensor indexes, pointsToVoxel(torch::Tensor points,
std::vector<int64_t> gridShape, torch::Tensor indexes,
const int64_t ndim, torch::Tensor pointIndex,
const int64_t gridVolume) { torch::Tensor grids,
auto device = points.device().type(); torch::Tensor numPointsPerGrid,
auto num_points = points.size(0); torch::Tensor voxels,
auto num_features = points.size(1); torch::Tensor coors,
auto pointIndexUnique = torch::full( std::vector<int64_t> gridShape,
{num_points + 1}, std::numeric_limits<int>::max(), const int64_t ndim) {
torch::dtype(torch::kInt32).device(points.device()));
auto grids = torch::zeros({gridVolume, num_features},
torch::dtype(points.dtype()).device(points.device()));
auto numPointsPerGrid = torch::zeros({gridVolume},
torch::dtype(torch::kInt32).device(points.device()));
if (points.device().type() == torch::kCPU) { if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently"); TV_THROW_INVALID_ARG("not support cpu currently");
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) { else if (points.device().type() == torch::kCUDA) {
scatter_point_to_grid_cuda(points, indexes, grids, scatter_point_to_grid_cuda(points, indexes, grids,
numPointsPerGrid, pointIndexUnique, gridShape, ndim); numPointsPerGrid, pointIndex, gridShape, ndim);
} }
#endif #endif
else { else {
TV_THROW_INVALID_ARG("unknown device type"); TV_THROW_INVALID_ARG("unknown device type");
} }
auto res = torch::_unique(pointIndexUnique); auto res = torch::_unique(pointIndex);
pointIndexUnique = std::get<0>(res); auto pointIndexUnique = std::get<0>(res);
auto num_voxel = pointIndexUnique.size(0) - 1; auto num_voxel = pointIndexUnique.size(0) - 1;
auto voxels = torch::zeros({num_voxel, num_features},
torch::dtype(points.dtype()).device(points.device()));
auto coors = torch::zeros({num_voxel, ndim},
torch::dtype(torch::kInt32).device(points.device()));
if (points.device().type() == torch::kCPU) { if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently"); TV_THROW_INVALID_ARG("not support cpu currently");
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) { else if (points.device().type() == torch::kCUDA) {
gather_point_from_grid_cuda(grids, numPointsPerGrid, gather_point_from_grid_cuda(grids, numPointsPerGrid,
pointIndexUnique, voxels, coors, gridShape, ndim); pointIndex, pointIndexUnique, voxels, coors, gridShape, ndim);
} }
#endif #endif
else { else {
TV_THROW_INVALID_ARG("unknown device type"); TV_THROW_INVALID_ARG("unknown device type");
} }
return {voxels, coors}; return num_voxel;
} }
} // namespace spconv } // namespace spconv
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import spconv
from spconv.utils import VoxelGeneratorV2, VoxelGeneratorV3
def waymo_data_gpu(batch_size=1):
print('gpu with total points available per voxel')
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
points = torch.from_numpy(data['pc']).cuda().float()
voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
gen = VoxelGeneratorV3(voxel_size, coors_range, max_points=200000,
num_features=points.shape[1],
dtype=points.dtype,
device=points.device)
voxels, coors = gen.generate(points)
times = []
with torch.no_grad():
for i in range(200):
torch.cuda.synchronize()
t = time.time()
voxels, coors = gen.generate(points)
torch.cuda.synchronize()
times.append(time.time() - t)
print("voxelization time", np.mean(times[100:]))
N = coors.shape[0]
batch_id = torch.zeros([N, 1], dtype=coors.dtype, device=coors.device)
coors = torch.cat([batch_id, coors], dim=1)
return voxels, coors, gen.grid_size
def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
print('cpu with %d max points per voxel' % max_points_per_voxel)
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], max_points_per_voxel,
150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = data["pc"]
data = gen.generate(pc)
times = []
with torch.no_grad():
for i in range(200):
torch.cuda.synchronize()
t = time.time()
data = gen.generate(pc)
torch.cuda.synchronize()
times.append(time.time() - t)
print("voxelization time", np.mean(times[100:]))
voxels = data["voxels"].reshape(-1, 3)
coors = data["coordinates"]
N = coors.shape[0]
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size
def main():
waymo_data_gpu()
waymo_data_cpu(1)
waymo_data_cpu(10)
waymo_data_cpu(40)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment