init GPU Voxelization

c328c870 · xmyqsh · cfaa1a3a · c328c870 · c328c870 · c328c870
Commit c328c870 authored Jul 03, 2020 by xmyqsh
10 changed files
--- a/include/spconv/point2voxel.cu.h
+++ b/include/spconv/point2voxel.cu.h
+#pragma once
+#include <tensorview/kernel_utils.h>
+#include <tensorview/tensorview.h>
+#include <torch/script.h>
+namespace spconv {
+template <typename Index, unsigned NDim>
+__global__ void scatterPointToGridKernel(
+    tv::TensorView<const float> points,
+    tv::TensorView<const Index> indexes,
+    tv::TensorView<float> grids,
+    tv::TensorView<Index> numPointsPerGrid,
+    tv::TensorView<Index> pointIndexUnique,
+    const tv::SimpleVector<Index, NDim> gridShape) {
+  Index index;
+  int numPoints = points.dim(0);
+  int numFeatures = points.dim(1);
+  for (int ix : tv::KernelLoopX<int>(numPoints)) {
+    // slow here, atomic Add + random access
+    // Use ILP to speed up it
+    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
+            indexes.data() + ix * NDim, gridShape.data(), 0);
+    pointIndexUnique(ix) = index;
+    atomicAdd(numPointsPerGrid.data() + index, Index(1));
+#pragma unroll
+    for (int k = 0; k != numFeatures; ++k) {
+      atomicAdd(grids.data() + index * numFeatures + k, *(points.data() + ix * numFeatures + k));
+    }
+  }
+}
+template <typename Index, unsigned NDim>
+__global__ void gatherPointFromGridKernel(
+    tv::TensorView<const float> grids,
+    tv::TensorView<const Index> numPointsPerGrid,
+    tv::TensorView<const Index> pointIndexUnique,
+    tv::TensorView<float> voxels,
+    tv::TensorView<Index> coors,
+    const tv::SimpleVector<Index, NDim> gridShape) {
+  Index index;
+  int numVoxels = voxels.dim(0);
+  int numFeatures = grids.dim(1);
+  for (int ix : tv::KernelLoopX<int>(numVoxels)) {
+    // slow here, random access
+    // Use ILP to speed up it
+    index = pointIndexUnique(ix);
+#pragma unroll
+    for (int k = 0; k != numFeatures; ++k) {
+      voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
+    }
+    index = tv::rowArrayIdxInv<Index, NDim>(
+        index, coors.data() + ix * NDim, gridShape.data());
+  }
+}
+} // namespace spconv
--- a/include/spconv/point2voxel_ops.h
+++ b/include/spconv/point2voxel_ops.h
+// Copyright 2020 xmyqsh
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <spconv/points2voxels.h>
+#include <tensorview/torch_utils.h>
+#include <torch/script.h>
+#include <utility/timer.h>
+namespace spconv {
+std::vector<torch::Tensor>
+pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
+                std::vector<int64_t> gridShape,
+                const int64_t ndim,
+                const int64_t gridVolume);
+} // namespace spconv
--- a/include/spconv/points2voxels.h
+++ b/include/spconv/points2voxels.h
+#pragma once
+#include <tensorview/tensorview.h>
+#include <torch/script.h>
+namespace spconv {
+void scatter_point_to_grid_cuda(
+    torch::Tensor points,
+    torch::Tensor indexes,
+    torch::Tensor grids,
+    torch::Tensor numPointsPerGrid,
+    torch::Tensor pointIndexUnique,
+    std::vector<int64_t> gridShape,
+    const int ndim);
+void gather_point_from_grid_cuda(
+    torch::Tensor grids, torch::Tensor numPointsPerGrid,
+    torch::Tensor pointIndexUnique,
+    torch::Tensor voxels, torch::Tensor coors,
+    std::vector<int64_t> gridShape,
+    const int ndim);
+} // namespace spconv
--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -157,3 +157,36 @@ def pillar_scatter(features, coors, shape):
        return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
    else:
        raise NotImplementedError
+def points_to_voxel(points, voxel_size, coors_range):
+    """
+        points: [N, ndim] float tensor. points[:, :3] contain xyz points and
+            points[:, 3:] contain other information such as reflectivity.
+        voxel_size: [3] list/tuple or array or tensor, float. xyz, indicate voxel size
+        coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
+            format: xyzxyz, minmax
+    """
+    if not isinstance(voxel_size, torch.Tensor):
+        if not isinstance(voxel_size, np.ndarray):
+            voxel_size = np.array(voxel_size, dtype=points.dtype)
+        voxel_size = torch.from_numpy(voxel_size).to(points.device)
+    if not isinstance(voxel_size, torch.Tensor):
+        if not isinstance(coors_range, np.ndarray):
+            coors_range = np.array(coors_range, dtype=points.dtype)
+        coors_range = torch.from_numpy(coors_range).to(points.device)
+    grid_shape = torch.round((coors_range[3:] - coors_range[:3]) / voxel_size).to(torch.int32)
+    grid_volume = grid_shape.prod()
+    ndim = grid_shape.shape[0]
+#    indexes = torch.round((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
+    indexes = torch.floor((points[:, :3] - coors_range[:3]) / voxel_size).to(torch.int32)
+    voxels, coors = torch.ops.spconv.points_to_voxel(points, indexes, grid_shape.cpu().numpy().tolist(), ndim, grid_volume.item())
+    # xyz --> zyx
+    #coors = coors[::-1]
+    x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
+    coors = torch.cat([z, y, x], dim=1)
+    # can be skipped
+    x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
+    voxels = torch.cat([z, y, x, f], dim=1)
+    return voxels, coors
--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 import numpy as np
+import torch
 from spconv import spconv_utils
+from spconv.ops import points_to_voxel as points_to_voxel3
 from spconv.spconv_utils import (non_max_suppression_cpu,
                                 points_to_voxel_3d_np,
                                 points_to_voxel_3d_np_mean,
@@ -292,3 +294,33 @@ class VoxelGeneratorV2:
    @property
    def grid_size(self):
        return self._grid_size
+class VoxelGeneratorV3:
+    def __init__(self,
+                 voxel_size,
+                 point_cloud_range):
+        self._point_cloud_range = point_cloud_range
+        self._voxel_size = voxel_size
+        self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
+        self._grid_size = self._grid_size.cpu().numpy().tolist()
+    def generate(self, points):
+        res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range)
+        return res
+    def generate_multi_gpu(self, points, max_voxels=None):
+        res = points_to_voxel3(points, self._voxel_size, self._point_cloud_range)
+        return res
+    @property
+    def voxel_size(self):
+        return self._voxel_size
+    @property
+    def point_cloud_range(self):
+        return self._point_cloud_range
+    @property
+    def grid_size(self):
+        return self._grid_size
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
-set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
+set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc point2voxel_ops.cc)
 if (SPCONV_BuildCUDA)
-    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc fused_conv.cu)
+    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc point2voxel.cu fused_conv.cu)
 endif()
 add_library(spconv SHARED ${ALL_FILES})

--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -15,12 +15,14 @@
 #include <spconv/fused_spconv_ops.h>
 #include <spconv/nms_ops.h>
 #include <spconv/pillar_scatter_ops.h>
+#include <spconv/point2voxel_ops.h>
 #include <spconv/pool_ops.h>
 #include <spconv/spconv_ops.h>
 #include <torch/script.h>
 static auto registry =
    torch::RegisterOperators()
+        .op("spconv::points_to_voxel", &spconv::pointsToVoxel)
        .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
        .op("spconv::indice_conv", &spconv::indiceConv)
        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)

--- a/src/spconv/point2voxel.cu
+++ b/src/spconv/point2voxel.cu
+#include <ATen/ATen.h>
+#include <spconv/point2voxel.cu.h>
+//#include <spconv/point2voxel.h>
+#include <tensorview/cuda_utils.h>
+#include <tensorview/mp_helper.h>
+#include <tensorview/tensor.h>
+#include <tensorview/tensorview.h>
+#include <tensorview/torch_utils.h>
+namespace spconv {
+void scatter_point_to_grid_cuda(
+    torch::Tensor points,
+    torch::Tensor indexes,
+    torch::Tensor grids,
+    torch::Tensor numPointsPerGrid,
+    torch::Tensor pointIndexUnique,
+    std::vector<int64_t> gridShape,
+    const int ndim) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  auto num_points = points.size(0);
+  auto num_features = points.size(1);
+  tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
+    using Index = decltype(IndexValue);
+    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
+      constexpr int NDim = decltype(I)::value;
+      tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
+      scatterPointToGridKernel<Index, NDim>
+          <<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS,
+             0, stream>>>(tv::torch2tv<float>(points),
+                          tv::torch2tv<Index>(indexes),
+                          tv::torch2tv<float>(grids),
+                          tv::torch2tv<Index>(numPointsPerGrid),
+                          tv::torch2tv<Index>(pointIndexUnique),
+                          gs);
+      TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
+#ifdef TV_LOG_KERNEL_INFO
+      cudaFuncAttributes attr;
+      checkCudaErrors(cudaFuncGetAttributes(
+          &attr, scatterPointToGridKernel<Index, NDim>));
+      tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim,
+                  ">", attr.numRegs);
+#endif
+    });
+  });
+}
+void gather_point_from_grid_cuda(
+    torch::Tensor grids, torch::Tensor numPointsPerGrid,
+    torch::Tensor pointIndexUnique,
+    torch::Tensor voxels, torch::Tensor coors,
+    std::vector<int64_t> gridShape,
+    const int ndim) {
+  auto stream = at::cuda::getCurrentCUDAStream();
+  auto num_voxel = voxels.size(0);
+  tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) {
+    using Index = decltype(IndexValue);
+    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
+      constexpr int NDim = decltype(I)::value;
+      tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
+      gatherPointFromGridKernel<Index, NDim>
+          <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS,
+             0, stream>>>(tv::torch2tv<float>(grids),
+                          tv::torch2tv<Index>(numPointsPerGrid),
+                          tv::torch2tv<Index>(pointIndexUnique),
+                          tv::torch2tv<float>(voxels),
+                          tv::torch2tv<Index>(coors),
+                          gs);
+      TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
+      cudaFuncAttributes attr2;
+#ifdef TV_LOG_KERNEL_INFO
+      checkCudaErrors(cudaFuncGetAttributes(
+          &attr2, gatherPointFromGridKernel<Index, NDim>));
+      tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim, ">",
+                  attr2.numRegs);
+#endif
+    });
+  });
+}
+} // namespace spconv
--- a/src/spconv/point2voxel_ops.cc
+++ b/src/spconv/point2voxel_ops.cc
+#include <spconv/point2voxel_ops.h>
+//#include <spconv/point2voxel.cu.h>
+namespace spconv {
+std::vector<torch::Tensor>
+pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
+                std::vector<int64_t> gridShape,
+                const int64_t ndim,
+                const int64_t gridVolume) {
+  auto device = points.device().type();
+  auto num_points = points.size(0);
+  auto num_features = points.size(1);
+  auto pointIndexUnique = torch::full(
+      {num_points + 1}, std::numeric_limits<int>::max(),
+      torch::dtype(torch::kInt32).device(points.device()));
+  auto grids = torch::zeros({gridVolume, num_features},
+      torch::dtype(points.dtype()).device(points.device()));
+  auto numPointsPerGrid = torch::zeros({gridVolume},
+      torch::dtype(torch::kInt32).device(points.device()));
+  if (points.device().type() == torch::kCPU) {
+    TV_THROW_INVALID_ARG("not support cpu currently");
+  }
+#ifdef TV_CUDA
+  else if (points.device().type() == torch::kCUDA) {
+    scatter_point_to_grid_cuda(points, indexes, grids,
+        numPointsPerGrid, pointIndexUnique, gridShape, ndim);
+  }
+#endif
+  else {
+    TV_THROW_INVALID_ARG("unknown device type");
+  }
+  auto res = torch::_unique(pointIndexUnique);
+  pointIndexUnique = std::get<0>(res);
+  auto num_voxel = pointIndexUnique.size(0) - 1;
+  auto voxels = torch::zeros({num_voxel, num_features},
+      torch::dtype(points.dtype()).device(points.device()));
+  auto coors = torch::zeros({num_voxel, ndim},
+      torch::dtype(torch::kInt32).device(points.device()));
+  if (points.device().type() == torch::kCPU) {
+    TV_THROW_INVALID_ARG("not support cpu currently");
+  }
+#ifdef TV_CUDA
+  else if (points.device().type() == torch::kCUDA) {
+    gather_point_from_grid_cuda(grids, numPointsPerGrid,
+        pointIndexUnique, voxels, coors, gridShape, ndim);
+  }
+#endif
+  else {
+    TV_THROW_INVALID_ARG("unknown device type");
+  }
+  return {voxels, coors};
+}
+} // namespace spconv
--- a/test/benchmark_points_to_voxel_gpu.py
+++ b/test/benchmark_points_to_voxel_gpu.py
+import time
+from pathlib import Path
+import numpy as np
+import torch
+from torch import nn
+import spconv
+from spconv.utils import VoxelGeneratorV3
+def waymo_data(batch_size=1):
+    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
+    points = torch.from_numpy(data['pc']).cuda().float()
+    voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
+    coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
+    gen = VoxelGeneratorV3(voxel_size, coors_range)
+    voxels, coors = gen.generate(points)
+    N = coors.shape[0]
+    batch_id = torch.zeros([N, 1], dtype=coors.dtype, device=coors.device)
+    coors = torch.cat([batch_id, coors], dim=1)
+    return voxels, coors, gen.grid_size
+class Net(nn.Module):
+    def __init__(self, shape, algo, device):
+        super().__init__()
+        self.device = device
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo),
+            spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo),
+            spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo),
+            # nn.BatchNorm1d(64),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo),
+            spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo),
+            spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo),
+            spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo),
+            spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo),
+            spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo),
+        )
+        max_batch_size = 1
+        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32, device=self.device)
+        # self.grid = None
+        self.shape = shape
+    def forward(self, features, coors, batch_size):
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid)
+        return self.net(x)
+def main():
+    voxels, coors, spatial_shape = waymo_data()
+    voxels_th, coors_th = voxels, coors
+    algo = spconv.ConvAlgo.Native
+    net = Net(spatial_shape[::-1], algo, voxels_th.device).cuda(device=voxels_th.device).eval().float()
+    print(coors_th.shape)
+    out = net(voxels_th, coors_th, 1)
+    print(out.spatial_shape)
+    times = []
+    with torch.no_grad():
+        for i in range(20):
+            torch.cuda.synchronize()
+            t = time.time()
+            out = net(voxels_th, coors_th, 1)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+    # print((net.grid == -1).float().sum(), net.grid.numel())
+    # print("spconv time", time.time() - t)
+    print("spconv time", np.mean(times[10:]))
+if __name__ == "__main__":
+    main()