pre-alloc and re-use gpu memory

9ms --> 1ms on waymo open dataset

pre-alloc and re-use gpu memory
9ms --> 1ms on waymo open dataset
c88efea2 · xmyqsh · c328c870 · c88efea2 · c88efea2 · c88efea2
Commit c88efea2 authored Jul 04, 2020 by xmyqsh
8 changed files
--- a/include/spconv/point2voxel.cu.h
+++ b/include/spconv/point2voxel.cu.h
@@ -11,7 +11,7 @@ __global__ void scatterPointToGridKernel(
    tv::TensorView<const Index> indexes,
    tv::TensorView<float> grids,
    tv::TensorView<Index> numPointsPerGrid,
-    tv::TensorView<Index> pointIndexUnique,
+    tv::TensorView<Index> pointIndex,
    const tv::SimpleVector<Index, NDim> gridShape) {
  Index index;
  int numPoints = points.dim(0);
@@ -22,7 +22,7 @@ __global__ void scatterPointToGridKernel(
    // Use ILP to speed up it
    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
            indexes.data() + ix * NDim, gridShape.data(), 0);
-    pointIndexUnique(ix) = index;
+    pointIndex(ix) = index;
    atomicAdd(numPointsPerGrid.data() + index, Index(1));
 #pragma unroll
    for (int k = 0; k != numFeatures; ++k) {
@@ -55,4 +55,35 @@ __global__ void gatherPointFromGridKernel(
        index, coors.data() + ix * NDim, gridShape.data());
  }
 }
+template <typename Index>
+__global__ void resetGridKernel(
+    tv::TensorView<float> grids,
+    tv::TensorView<Index> numPointsPerGrid,
+    tv::TensorView<Index> pointIndexUnique) {
+  Index index;
+  int numVoxels = pointIndexUnique.dim(0) - 1;
+  int numFeatures = grids.dim(1);
+  for (int ix : tv::KernelLoopX<int>(numVoxels)) {
+    // slow here, random access
+    // Use ILP to speed up it
+    index = pointIndexUnique(ix);
+#pragma unroll
+    for (int k = 0; k != numFeatures; ++k) {
+      grids(index, k) = 0;
+      numPointsPerGrid(index) = 0;
+    }
+  }
+}
+template <typename Index>
+__global__ void resetPointIndexKernel(
+    tv::TensorView<Index> pointIndex, const Index gridVolume) {
+  int num_max_points = pointIndex.dim(0) - 1;
+  for (int ix : tv::KernelLoopX<int>(num_max_points)) {
+    pointIndex(ix) = gridVolume;
+  }
+}
 } // namespace spconv
--- a/include/spconv/point2voxel_ops.h
+++ b/include/spconv/point2voxel_ops.h
@@ -21,10 +21,15 @@
 namespace spconv {
-std::vector<torch::Tensor>
+int64_t
-pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
+pointsToVoxel(torch::Tensor points,
-                std::vector<int64_t> gridShape,
+              torch::Tensor indexes,
-                const int64_t ndim,
+              torch::Tensor pointIndex,
-                const int64_t gridVolume);
+              torch::Tensor grids,
+              torch::Tensor numPointsPerGrid,
+              torch::Tensor voxels,
+              torch::Tensor coors,
+              std::vector<int64_t> gridShape,
+              const int64_t ndim);
 } // namespace spconv
--- a/include/spconv/points2voxels.h
+++ b/include/spconv/points2voxels.h
@@ -11,12 +11,13 @@ void scatter_point_to_grid_cuda(
    torch::Tensor indexes,
    torch::Tensor grids,
    torch::Tensor numPointsPerGrid,
-    torch::Tensor pointIndexUnique,
+    torch::Tensor pointIndex,
    std::vector<int64_t> gridShape,
    const int ndim);
 void gather_point_from_grid_cuda(
    torch::Tensor grids, torch::Tensor numPointsPerGrid,
+    torch::Tensor pointIndex,
    torch::Tensor pointIndexUnique,
    torch::Tensor voxels, torch::Tensor coors,
    std::vector<int64_t> gridShape,

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -157,36 +157,3 @@ def pillar_scatter(features, coors, shape):
        return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
    else:
        raise NotImplementedError
-def points_to_voxel(points, voxel_size, coors_range):
-    """
-        points: [N, ndim] float tensor. points[:, :3] contain xyz points and
-            points[:, 3:] contain other information such as reflectivity.
-        voxel_size: [3] list/tuple or array or tensor, float. xyz, indicate voxel size
-        coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
-            format: xyzxyz, minmax
-    """
-    if not isinstance(voxel_size, torch.Tensor):
-        if not isinstance(voxel_size, np.ndarray):
-            voxel_size = np.array(voxel_size, dtype=points.dtype)
-        voxel_size = torch.from_numpy(voxel_size).to(points.device)
-    if not isinstance(voxel_size, torch.Tensor):
-        if not isinstance(coors_range, np.ndarray):
-            coors_range = np.array(coors_range, dtype=points.dtype)
-        coors_range = torch.from_numpy(coors_range).to(points.device)
-    grid_shape = torch.round((coors_range[3:] - coors_range[:3]) / voxel_size).to(torch.int32)
-    grid_volume = grid_shape.prod()
-    ndim = grid_shape.shape[0]
-#    indexes = torch.round((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
-    indexes = torch.floor((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
-    voxels, coors = torch.ops.spconv.points_to_voxel(points, indexes, grid_shape.cpu().numpy().tolist(), ndim, grid_volume.item())
-    # xyz --> zyx
-    #coors = coors[::-1]
-    x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
-    coors = torch.cat([z, y, x], dim=1)
-    # can be skipped
-    x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
-    voxels = torch.cat([z, y, x, f], dim=1)
-    return voxels, coors
--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
@@ -16,7 +16,6 @@ import numpy as np
 import torch
 from spconv import spconv_utils
-from spconv.ops import points_to_voxel as points_to_voxel3
 from spconv.spconv_utils import (non_max_suppression_cpu,
                                 points_to_voxel_3d_np,
                                 points_to_voxel_3d_np_mean,
@@ -298,19 +297,39 @@ class VoxelGeneratorV2:
 class VoxelGeneratorV3:
    def __init__(self,
                 voxel_size,
-                 point_cloud_range):
+                 point_cloud_range,
+                 max_points,
+                 num_features,
+                 dtype,
+                 device):
+        self._max_points = max_points
        self._point_cloud_range = point_cloud_range
        self._voxel_size = voxel_size
        self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
+        grid_volume = self._grid_size.prod()
        self._grid_size = self._grid_size.cpu().numpy().tolist()
+        self._ndim = len(self._grid_size)
+        self._dtype = dtype
+        self._device = device
+        self._point_index = torch.full([max_points + 1], grid_volume, dtype=torch.int32, device=self._device)
+        self._grids = torch.zeros([grid_volume, num_features], dtype=self._dtype, device=self._device)
+        self._num_points_per_grid = torch.zeros([grid_volume], dtype=torch.int32, device=self._device)
+        self._voxels = torch.zeros([max_points, num_features], dtype=self._dtype, device=self._device)
+        self._coors = torch.zeros([max_points, self._ndim], dtype=torch.int32, device=self._device)
    def generate(self, points):
-        res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range)
+        assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0])
-        return res
+        points.to(self._dtype).to(self._device)
+        return self.points_to_voxel(points)
-    def generate_multi_gpu(self, points, max_voxels=None):
+    def generate_multi_gpu(self, points):
-        res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range)
+        assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0])
-        return res
+        points.to(self._dtype).to(self._device)
+        return self.points_to_voxel(points)
    @property
    def voxel_size(self):
@@ -324,3 +343,31 @@ class VoxelGeneratorV3:
    def grid_size(self):
        return self._grid_size
+    def points_to_voxel(self, points):
+        """
+            points: [N, ndim] float tensor. points[:, :3] contain xyz points and
+                points[:, 3:] contain other information such as reflectivity.
+            voxel_size: [3] list/tuple or array or tensor, float. xyz, indicate voxel size
+            coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
+                format: xyzxyz, minmax
+        """
+        indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
+        num_voxel = torch.ops.spconv.points_to_voxel(points, indexes,
+                                                     self._point_index,
+                                                     self._grids,
+                                                     self._num_points_per_grid,
+                                                     self._voxels,
+                                                     self._coors,
+                                                     self._grid_size,
+                                                     self._ndim)
+        voxels = self._voxels[:num_voxel, :]
+        coors = self._coors[:num_voxel, :]
+        # xyz --> zyx
+        #coors = coors[::-1]
+        x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
+        coors = torch.cat([z, y, x], dim=1)
+        # can be skipped
+#        x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
+#        voxels = torch.cat([z, y, x, f], dim=1)
+        return voxels, coors
--- a/src/spconv/point2voxel.cu
+++ b/src/spconv/point2voxel.cu
@@ -15,13 +15,13 @@ void scatter_point_to_grid_cuda(
    torch::Tensor indexes,
    torch::Tensor grids,
    torch::Tensor numPointsPerGrid,
-    torch::Tensor pointIndexUnique,
+    torch::Tensor pointIndex,
    std::vector<int64_t> gridShape,
    const int ndim) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto num_points = points.size(0);
  auto num_features = points.size(1);
-  tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
+  tv::dispatch_torch<int32_t>(pointIndex.scalar_type(), [&](auto IndexValue) {
    using Index = decltype(IndexValue);
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = decltype(I)::value;
@@ -32,7 +32,7 @@ void scatter_point_to_grid_cuda(
                          tv::torch2tv<Index>(indexes),
                          tv::torch2tv<float>(grids),
                          tv::torch2tv<Index>(numPointsPerGrid),
-                          tv::torch2tv<Index>(pointIndexUnique),
+                          tv::torch2tv<Index>(pointIndex),
                          gs);
      TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
 #ifdef TV_LOG_KERNEL_INFO
@@ -48,17 +48,33 @@ void scatter_point_to_grid_cuda(
 void gather_point_from_grid_cuda(
    torch::Tensor grids, torch::Tensor numPointsPerGrid,
+    torch::Tensor pointIndex,
    torch::Tensor pointIndexUnique,
    torch::Tensor voxels, torch::Tensor coors,
    std::vector<int64_t> gridShape,
    const int ndim) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto num_voxel = voxels.size(0);
+  auto num_max_points = pointIndex.size(0) - 1;
+  auto grid_volume = grids.size(0);
  tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
    using Index = decltype(IndexValue);
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = decltype(I)::value;
      tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
+      resetPointIndexKernel<Index>
+          <<<tv::cuda::getBlocks(num_max_points), tv::cuda::CUDA_NUM_THREADS,
+             0, stream>>>(tv::torch2tv<Index>(pointIndex), grid_volume);
+      TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed");
+#ifdef TV_LOG_KERNEL_INFO
+      cudaFuncAttributes attr0;
+      checkCudaErrors(cudaFuncGetAttributes(
+          &attr0, resetPointIndexKernel<Index, NDim>));
+      tv::ssprint("resetPointIndexKernel<", tv::type_s<Index>, NDim, ">",
+                  attr0.numRegs);
+#endif
      gatherPointFromGridKernel<Index, NDim>
          <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
             0, stream>>>(tv::torch2tv<float>(grids),
@@ -68,11 +84,25 @@ void gather_point_from_grid_cuda(
                          tv::torch2tv<Index>(coors),
                          gs);
      TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
-      cudaFuncAttributes attr2;
 #ifdef TV_LOG_KERNEL_INFO
+      cudaFuncAttributes attr1;
      checkCudaErrors(cudaFuncGetAttributes(
-          &attr2, gatherPointFromGridKernel<Index, NDim>));
+          &attr1, gatherPointFromGridKernel<Index, NDim>));
      tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim, ">",
+                  attr1.numRegs);
+#endif
+      resetGridKernel<Index>
+          <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
+             0, stream>>>(tv::torch2tv<float>(grids),
+                          tv::torch2tv<Index>(numPointsPerGrid),
+                          tv::torch2tv<Index>(pointIndexUnique));
+      TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
+#ifdef TV_LOG_KERNEL_INFO
+      cudaFuncAttributes attr2;
+      checkCudaErrors(cudaFuncGetAttributes(
+          &attr2, resetGridKernel<Index, NDim>));
+      tv::ssprint("resetGridKernel<", tv::type_s<Index>, NDim, ">",
                  attr2.numRegs);
 #endif
    });

--- a/src/spconv/point2voxel_ops.cc
+++ b/src/spconv/point2voxel_ops.cc
@@ -3,53 +3,44 @@
 namespace spconv {
-std::vector<torch::Tensor>
+int64_t
-pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
+pointsToVoxel(torch::Tensor points,
-                std::vector<int64_t> gridShape,
+              torch::Tensor indexes,
-                const int64_t ndim,
+              torch::Tensor pointIndex,
-                const int64_t gridVolume) {
+              torch::Tensor grids,
-  auto device = points.device().type();
+              torch::Tensor numPointsPerGrid,
-  auto num_points = points.size(0);
+              torch::Tensor voxels,
-  auto num_features = points.size(1);
+              torch::Tensor coors,
-  auto pointIndexUnique = torch::full(
+              std::vector<int64_t> gridShape,
-      {num_points + 1}, std::numeric_limits<int>::max(),
+              const int64_t ndim) {
-      torch::dtype(torch::kInt32).device(points.device()));
-  auto grids = torch::zeros({gridVolume, num_features},
-      torch::dtype(points.dtype()).device(points.device()));
-  auto numPointsPerGrid = torch::zeros({gridVolume},
-      torch::dtype(torch::kInt32).device(points.device()));
  if (points.device().type() == torch::kCPU) {
    TV_THROW_INVALID_ARG("not support cpu currently");
  }
 #ifdef TV_CUDA
  else if (points.device().type() == torch::kCUDA) {
    scatter_point_to_grid_cuda(points, indexes, grids,
-        numPointsPerGrid, pointIndexUnique, gridShape, ndim);
+        numPointsPerGrid, pointIndex, gridShape, ndim);
  }
 #endif
  else {
    TV_THROW_INVALID_ARG("unknown device type");
  }
-  auto res = torch::_unique(pointIndexUnique);
+  auto res = torch::_unique(pointIndex);
-  pointIndexUnique = std::get<0>(res);
+  auto pointIndexUnique = std::get<0>(res);
  auto num_voxel = pointIndexUnique.size(0) - 1;
-  auto voxels = torch::zeros({num_voxel, num_features},
-      torch::dtype(points.dtype()).device(points.device()));
-  auto coors = torch::zeros({num_voxel, ndim},
-      torch::dtype(torch::kInt32).device(points.device()));
  if (points.device().type() == torch::kCPU) {
    TV_THROW_INVALID_ARG("not support cpu currently");
  }
 #ifdef TV_CUDA
  else if (points.device().type() == torch::kCUDA) {
    gather_point_from_grid_cuda(grids, numPointsPerGrid,
-        pointIndexUnique, voxels, coors, gridShape, ndim);
+        pointIndex, pointIndexUnique, voxels, coors, gridShape, ndim);
  }
 #endif
  else {
    TV_THROW_INVALID_ARG("unknown device type");
  }
-  return {voxels, coors};
+  return num_voxel;
 }
 } // namespace spconv
--- a/test/benchmark_points_to_voxel.py
+++ b/test/benchmark_points_to_voxel.py
+import time
+from pathlib import Path
+import numpy as np
+import torch
+from torch import nn
+import spconv
+from spconv.utils import VoxelGeneratorV2, VoxelGeneratorV3
+def waymo_data_gpu(batch_size=1):
+    print('gpu with total points available per voxel')
+    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
+    points = torch.from_numpy(data['pc']).cuda().float()
+    voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
+    coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
+    gen = VoxelGeneratorV3(voxel_size, coors_range, max_points=200000,
+                                                    num_features=points.shape[1],
+                                                    dtype=points.dtype,
+                                                    device=points.device)
+    voxels, coors = gen.generate(points)
+    times = []
+    with torch.no_grad():
+        for i in range(200):
+            torch.cuda.synchronize()
+            t = time.time()
+            voxels, coors = gen.generate(points)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+    print("voxelization time", np.mean(times[100:]))
+    N = coors.shape[0]
+    batch_id = torch.zeros([N, 1], dtype=coors.dtype, device=coors.device)
+    coors = torch.cat([batch_id, coors], dim=1)
+    return voxels, coors, gen.grid_size
+def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
+    print('cpu with %d max points per voxel' % max_points_per_voxel)
+    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], max_points_per_voxel,
+                           150000)
+    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
+    pc = data["pc"]
+    data = gen.generate(pc)
+    times = []
+    with torch.no_grad():
+        for i in range(200):
+            torch.cuda.synchronize()
+            t = time.time()
+            data = gen.generate(pc)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+    print("voxelization time", np.mean(times[100:]))
+    voxels = data["voxels"].reshape(-1, 3)
+    coors = data["coordinates"]
+    N = coors.shape[0]
+    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
+    return voxels, coors, gen.grid_size
+def main():
+    waymo_data_gpu()
+    waymo_data_cpu(1)
+    waymo_data_cpu(10)
+    waymo_data_cpu(40)
+if __name__ == "__main__":
+    main()