"platforms/hip/include/HipArray.h" did not exist on "d0426ba9d6b04481f13945cc9f2b1f2eb51166ba"
Commit 3517290c authored by yanyan
Browse files

format code, add benchmark per layer

parent 540a2209
...@@ -10,9 +10,7 @@ ...@@ -10,9 +10,7 @@
namespace spconv { namespace spconv {
void scatter_point_to_grid_cuda( void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
torch::Tensor points,
torch::Tensor indexes,
torch::Tensor grids, torch::Tensor grids,
torch::Tensor numPointsPerGrid, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex, torch::Tensor pointIndex,
...@@ -27,27 +25,25 @@ void scatter_point_to_grid_cuda( ...@@ -27,27 +25,25 @@ void scatter_point_to_grid_cuda(
constexpr int NDim = decltype(I)::value; constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end()); tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
scatterPointToGridKernel<Index, NDim> scatterPointToGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS, <<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS, 0,
0, stream>>>(tv::torch2tv<float>(points), stream>>>(tv::torch2tv<float>(points),
tv::torch2tv<Index>(indexes), tv::torch2tv<Index>(indexes), tv::torch2tv<float>(grids),
tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid), tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndex), tv::torch2tv<Index>(pointIndex), gs);
gs);
TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed"); TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes( checkCudaErrors(
&attr, scatterPointToGridKernel<Index, NDim>)); cudaFuncGetAttributes(&attr, scatterPointToGridKernel<Index, NDim>));
tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim, tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim, ">",
">", attr.numRegs); attr.numRegs);
#endif #endif
}); });
}); });
} }
void gather_point_from_grid_cuda( void gather_point_from_grid_cuda(torch::Tensor grids,
torch::Tensor grids, torch::Tensor numPointsPerGrid, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex, torch::Tensor pointIndex,
torch::Tensor pointIndexUnique, torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors, torch::Tensor voxels, torch::Tensor coors,
...@@ -57,15 +53,17 @@ void gather_point_from_grid_cuda( ...@@ -57,15 +53,17 @@ void gather_point_from_grid_cuda(
auto num_voxel = voxels.size(0); auto num_voxel = voxels.size(0);
auto num_max_points = pointIndex.size(0) - 1; auto num_max_points = pointIndex.size(0) - 1;
auto grid_volume = grids.size(0); auto grid_volume = grids.size(0);
tv::dispatch_torch<int32_t>(pointIndexUnique.scalar_type(), [&](auto IndexValue) { tv::dispatch_torch<int32_t>(
pointIndexUnique.scalar_type(), [&](auto IndexValue) {
using Index = decltype(IndexValue); using Index = decltype(IndexValue);
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) { tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value; constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end()); tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
resetPointIndexKernel<Index> resetPointIndexKernel<Index>
<<<tv::cuda::getBlocks(num_max_points), tv::cuda::CUDA_NUM_THREADS, <<<tv::cuda::getBlocks(num_max_points),
0, stream>>>(tv::torch2tv<Index>(pointIndex), grid_volume); tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(pointIndex), grid_volume);
TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed"); TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed");
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr0; cudaFuncAttributes attr0;
...@@ -76,32 +74,30 @@ void gather_point_from_grid_cuda( ...@@ -76,32 +74,30 @@ void gather_point_from_grid_cuda(
#endif #endif
gatherPointFromGridKernel<Index, NDim> gatherPointFromGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, 0,
0, stream>>>(tv::torch2tv<float>(grids), stream>>>(tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid), tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique), tv::torch2tv<Index>(pointIndexUnique),
tv::torch2tv<float>(voxels), tv::torch2tv<float>(voxels),
tv::torch2tv<Index>(coors), tv::torch2tv<Index>(coors), gs);
gs);
TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed"); TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr1; cudaFuncAttributes attr1;
checkCudaErrors(cudaFuncGetAttributes( checkCudaErrors(cudaFuncGetAttributes(
&attr1, gatherPointFromGridKernel<Index, NDim>)); &attr1, gatherPointFromGridKernel<Index, NDim>));
tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim, ">", tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim,
attr1.numRegs); ">", attr1.numRegs);
#endif #endif
resetGridKernel<Index> resetGridKernel<Index><<<tv::cuda::getBlocks(num_voxel),
<<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
0, stream>>>(tv::torch2tv<float>(grids), tv::torch2tv<float>(grids), tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndexUnique)); tv::torch2tv<Index>(pointIndexUnique));
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed"); TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr2; cudaFuncAttributes attr2;
checkCudaErrors(cudaFuncGetAttributes( checkCudaErrors(
&attr2, resetGridKernel<Index, NDim>)); cudaFuncGetAttributes(&attr2, resetGridKernel<Index, NDim>));
tv::ssprint("resetGridKernel<", tv::type_s<Index>, NDim, ">", tv::ssprint("resetGridKernel<", tv::type_s<Index>, NDim, ">",
attr2.numRegs); attr2.numRegs);
#endif #endif
......
...@@ -3,23 +3,18 @@ ...@@ -3,23 +3,18 @@
namespace spconv { namespace spconv {
int64_t int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
pointsToVoxel(torch::Tensor points, torch::Tensor pointIndex, torch::Tensor grids,
torch::Tensor indexes, torch::Tensor numPointsPerGrid, torch::Tensor voxels,
torch::Tensor pointIndex, torch::Tensor coors, std::vector<int64_t> gridShape,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor voxels,
torch::Tensor coors,
std::vector<int64_t> gridShape,
const int64_t ndim) { const int64_t ndim) {
if (points.device().type() == torch::kCPU) { if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently"); TV_THROW_INVALID_ARG("not support cpu currently");
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) { else if (points.device().type() == torch::kCUDA) {
scatter_point_to_grid_cuda(points, indexes, grids, scatter_point_to_grid_cuda(points, indexes, grids, numPointsPerGrid,
numPointsPerGrid, pointIndex, gridShape, ndim); pointIndex, gridShape, ndim);
} }
#endif #endif
else { else {
...@@ -33,8 +28,9 @@ pointsToVoxel(torch::Tensor points, ...@@ -33,8 +28,9 @@ pointsToVoxel(torch::Tensor points,
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) { else if (points.device().type() == torch::kCUDA) {
gather_point_from_grid_cuda(grids, numPointsPerGrid, gather_point_from_grid_cuda(grids, numPointsPerGrid, pointIndex,
pointIndex, pointIndexUnique, voxels, coors, gridShape, ndim); pointIndexUnique, voxels, coors, gridShape,
ndim);
} }
#endif #endif
else { else {
......
...@@ -247,10 +247,10 @@ torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters, ...@@ -247,10 +247,10 @@ torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
} }
template <int Algo> template <int Algo>
torch::Tensor torch::Tensor indiceConvFused(torch::Tensor features, torch::Tensor filters,
indiceConvFused(torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indiceNum, int64_t numActOut,
int64_t numActOut, int64_t _inverse, int64_t _subM) { int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0); auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0; bool subM = _subM != 0;
...@@ -282,7 +282,8 @@ indiceConvFused(torch::Tensor features, torch::Tensor filters, ...@@ -282,7 +282,8 @@ indiceConvFused(torch::Tensor features, torch::Tensor filters,
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
FusedConvDispatch<Algo>::fwd(output, features, filters[i], indicePairs[inverse][i], FusedConvDispatch<Algo>::fwd(output, features, filters[i],
indicePairs[inverse][i],
indicePairs[!inverse][i], nHot); indicePairs[!inverse][i], nHot);
} }
#endif #endif
...@@ -518,8 +519,7 @@ template <int Algo> ...@@ -518,8 +519,7 @@ template <int Algo>
std::vector<torch::Tensor> std::vector<torch::Tensor>
indiceConvBwFused(torch::Tensor features, torch::Tensor filters, indiceConvBwFused(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
int64_t _subM) {
auto kernelVolume = indiceNum.size(0); auto kernelVolume = indiceNum.size(0);
bool subM = _subM != 0; bool subM = _subM != 0;
bool inverse = _inverse != 0; bool inverse = _inverse != 0;
...@@ -723,7 +723,6 @@ template <> struct ConvDispatch<kMinkowskiEngine> { ...@@ -723,7 +723,6 @@ template <> struct ConvDispatch<kMinkowskiEngine> {
constexpr static auto *bwd = indiceConvBwFused<kFMinkowskiEngine>; constexpr static auto *bwd = indiceConvBwFused<kFMinkowskiEngine>;
}; };
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM, int64_t numActOut, int64_t _inverse, int64_t _subM,
......
...@@ -26,38 +26,104 @@ class Net(nn.Module): ...@@ -26,38 +26,104 @@ class Net(nn.Module):
def __init__(self, shape, algo): def __init__(self, shape, algo):
super().__init__() super().__init__()
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo), spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo), algo=algo),
spconv.SubMConv3d(64,
64,
3,
bias=False,
indice_key="c0",
algo=algo),
# nn.BatchNorm1d(32), # nn.BatchNorm1d(32),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo), spconv.SubMConv3d(64,
spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo), 96,
3,
bias=False,
indice_key="c1",
algo=algo),
spconv.SubMConv3d(96,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
# nn.BatchNorm1d(64), # nn.BatchNorm1d(64),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo), spconv.SubMConv3d(96,
spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo), 128,
3,
bias=False,
indice_key="c2",
algo=algo),
spconv.SubMConv3d(128,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo), spconv.SubMConv3d(128,
spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo), 160,
3,
bias=False,
indice_key="c3",
algo=algo),
spconv.SubMConv3d(160,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo), spconv.SubMConv3d(160,
spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo), 192,
3,
bias=False,
indice_key="c4",
algo=algo),
spconv.SubMConv3d(192,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo), spconv.SubMConv3d(192,
spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo), 224,
3,
bias=False,
indice_key="c5",
algo=algo),
spconv.SubMConv3d(224,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo), spconv.SubMConv3d(224,
spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo), 256,
3,
bias=False,
indice_key="c6",
algo=algo),
spconv.SubMConv3d(256,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
) )
max_batch_size = 1 max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster. # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
......
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import spconv
from spconv.utils import VoxelGeneratorV2
def waymo_data(batch_size=1):
    """Load the benchmark point cloud and voxelize it on the CPU.

    Reads the ``pc`` array from ``data/benchmark-pc.npz`` next to this
    script, runs it through ``VoxelGeneratorV2`` and prepends a constant
    batch-index column of zeros to the voxel coordinates.

    Args:
        batch_size: Currently unused; every voxel is assigned batch index 0.

    Returns:
        A tuple ``(voxels, coors, grid_size)`` where ``voxels`` is the
        generator's ``"voxels"`` output reshaped to ``(-1, 3)``, ``coors`` is
        the generator's ``"coordinates"`` output with a leading zero column,
        and ``grid_size`` is ``gen.grid_size``.
    """
    # Third positional argument is the max points per voxel (1 here); the
    # last one is presumably the voxel budget -- confirm against
    # VoxelGeneratorV2's signature.
    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
                           150000)
    # np.load on an .npz returns a lazily-reading NpzFile that holds the
    # archive open; the context manager closes it once "pc" is materialized.
    with np.load(Path(__file__).parent / "data" /
                 "benchmark-pc.npz") as archive:
        pc = archive["pc"]
    data = gen.generate(pc)
    voxels = data["voxels"].reshape(-1, 3)
    coors = data["coordinates"]
    N = coors.shape[0]
    # Prepend the batch index (always 0) as the first coordinate column.
    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
    return voxels, coors, gen.grid_size
class Net(nn.Module):
    """Benchmark network: seven stages of paired ``SubMConv3d`` layers.

    Each stage holds two bias-free ``spconv.SubMConv3d`` layers that share an
    ``indice_key`` (``"c<stage>"``) and are named ``"subm-<stage>-<0|1>"``.
    Every stage except the last is followed by a
    ``spconv.SparseMaxPool3d(2, 2)`` named ``"pool-<stage>"``. No
    normalization or activation layers are inserted between convolutions.

    Args:
        shape: Spatial shape forwarded to ``spconv.SparseConvTensor`` and used
            to size the pre-allocated grid.
        algo: Convolution algorithm selector forwarded to every
            ``SubMConv3d``.
    """

    def __init__(self, shape, algo):
        super().__init__()
        # Output channel count of each stage; the network input has 3
        # channels.
        stage_widths = (64, 96, 128, 160, 192, 224, 256)
        final_stage = len(stage_widths) - 1

        layers = []
        incoming = 3
        for stage, width in enumerate(stage_widths):
            # First conv of a stage changes the channel count, the second
            # keeps it.
            for position, fan_in in enumerate((incoming, width)):
                layers.append(
                    spconv.SubMConv3d(fan_in,
                                      width,
                                      3,
                                      bias=False,
                                      indice_key="c{}".format(stage),
                                      algo=algo,
                                      name="subm-{}-{}".format(
                                          stage, position)))
            if stage != final_stage:
                layers.append(
                    spconv.SparseMaxPool3d(2,
                                           2,
                                           name="pool-{}".format(stage)))
            incoming = width
        self.net = spconv.SparseSequential(*layers)

        max_batch_size = 1
        # The grid (dense map) is used for indice generation; handing spconv
        # a pre-allocated one can run faster than passing None.
        self.grid = torch.full([max_batch_size, *shape], -1,
                               dtype=torch.int32).cuda()
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack.

        Args:
            features: Per-voxel features passed to ``SparseConvTensor``.
            coors: Voxel coordinates passed to ``SparseConvTensor``.
            batch_size: Batch size passed to ``SparseConvTensor``.

        Returns:
            Whatever ``self.net`` returns for the wrapped input.
        """
        sparse_input = spconv.SparseConvTensor(features,
                                               coors,
                                               self.shape,
                                               batch_size,
                                               self.grid,
                                               benchmark=True)
        return self.net(sparse_input)
def main():
    """Run the per-layer spconv benchmark on the bundled point cloud.

    Builds ``Net`` with the Minkowski algorithm, runs one untimed forward
    pass, then times 20 forward passes under ``torch.no_grad()``. Prints the
    mean wall time of the last 10 iterations and the accumulated per-layer
    ``"time"`` and ``"indice_gen_time"`` records for layer ``"subm-6-0"``.
    """
    dtype = torch.float32
    voxels, coors, spatial_shape = waymo_data()
    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
    coors_th = torch.from_numpy(coors).cuda().int()
    algo = spconv.ConvAlgo.Minkowski
    net = Net(spatial_shape[::-1], algo).cuda().eval().to(dtype)
    print(coors_th.shape)
    # Untimed first pass.
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)
    times = []
    detail_bench = {}
    detail_ind_gen_bench = {}
    with torch.no_grad():
        for _ in range(20):
            torch.cuda.synchronize()
            # perf_counter is the monotonic, high-resolution clock intended
            # for measuring intervals; time.time() can jump with system clock
            # adjustments.
            start = time.perf_counter()
            out = net(voxels_th, coors_th, 1)
            torch.cuda.synchronize()
            times.append(time.perf_counter() - start)
            # Aggregate per-layer records only after the clock has stopped so
            # the bookkeeping does not inflate the measured forward time.
            for k, v in out.benchmark_record.items():
                detail_bench.setdefault(k, []).extend(v["time"])
                detail_ind_gen_bench.setdefault(k, []).extend(
                    v["indice_gen_time"])
    # print((net.grid == -1).float().sum(), net.grid.numel())
    # Skip the first 10 iterations when averaging.
    print("spconv time", np.mean(times[10:]))
    print(detail_bench["subm-6-0"])
    print(detail_ind_gen_bench["subm-6-0"])


if __name__ == "__main__":
    main()
...@@ -13,10 +13,14 @@ def waymo_data_gpu(batch_size=1): ...@@ -13,10 +13,14 @@ def waymo_data_gpu(batch_size=1):
print('gpu with total points available per voxel') print('gpu with total points available per voxel')
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz") data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
points = torch.from_numpy(data['pc']).cuda().float() points = torch.from_numpy(data['pc']).cuda().float()
voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device) voxel_size = torch.Tensor([0.1, 0.1,
coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device) 0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80,
gen = VoxelGeneratorV3(voxel_size, coors_range, max_points=200000, 6]).to(points.dtype).to(points.device)
gen = VoxelGeneratorV3(voxel_size,
coors_range,
max_points=200000,
num_features=points.shape[1], num_features=points.shape[1],
dtype=points.dtype, dtype=points.dtype,
device=points.device) device=points.device)
...@@ -40,8 +44,8 @@ def waymo_data_gpu(batch_size=1): ...@@ -40,8 +44,8 @@ def waymo_data_gpu(batch_size=1):
def waymo_data_cpu(max_points_per_voxel=1, batch_size=1): def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
print('cpu with %d max points per voxel' % max_points_per_voxel) print('cpu with %d max points per voxel' % max_points_per_voxel)
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], max_points_per_voxel, gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
150000) max_points_per_voxel, 150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz") data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = data["pc"] pc = data["pc"]
data = gen.generate(pc) data = gen.generate(pc)
...@@ -62,6 +66,7 @@ def waymo_data_cpu(max_points_per_voxel=1, batch_size=1): ...@@ -62,6 +66,7 @@ def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1) coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size return voxels, coors, gen.grid_size
def get_index(coor, grid_size): def get_index(coor, grid_size):
index = coor[0] index = coor[0]
for c, g in zip(coor[1:], grid_size): for c, g in zip(coor[1:], grid_size):
...@@ -100,5 +105,6 @@ def main(): ...@@ -100,5 +105,6 @@ def main():
print('Perfect GPU Voxelization!!!') print('Perfect GPU Voxelization!!!')
if __name__ == "__main__": if __name__ == "__main__":
main() main()
...@@ -12,8 +12,10 @@ from spconv.utils import VoxelGeneratorV3 ...@@ -12,8 +12,10 @@ from spconv.utils import VoxelGeneratorV3
def waymo_data(batch_size=1): def waymo_data(batch_size=1):
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz") data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
points = torch.from_numpy(data['pc']).cuda().float() points = torch.from_numpy(data['pc']).cuda().float()
voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device) voxel_size = torch.Tensor([0.1, 0.1,
coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device) 0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80,
6]).to(points.dtype).to(points.device)
gen = VoxelGeneratorV3(voxel_size, coors_range) gen = VoxelGeneratorV3(voxel_size, coors_range)
voxels, coors = gen.generate(points) voxels, coors = gen.generate(points)
...@@ -28,43 +30,111 @@ class Net(nn.Module): ...@@ -28,43 +30,111 @@ class Net(nn.Module):
super().__init__() super().__init__()
self.device = device self.device = device
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo), spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo), algo=algo),
spconv.SubMConv3d(64,
64,
3,
bias=False,
indice_key="c0",
algo=algo),
# nn.BatchNorm1d(32), # nn.BatchNorm1d(32),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo), spconv.SubMConv3d(64,
spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo), 96,
3,
bias=False,
indice_key="c1",
algo=algo),
spconv.SubMConv3d(96,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
# nn.BatchNorm1d(64), # nn.BatchNorm1d(64),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo), spconv.SubMConv3d(96,
spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo), 128,
3,
bias=False,
indice_key="c2",
algo=algo),
spconv.SubMConv3d(128,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo), spconv.SubMConv3d(128,
spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo), 160,
3,
bias=False,
indice_key="c3",
algo=algo),
spconv.SubMConv3d(160,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo), spconv.SubMConv3d(160,
spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo), 192,
3,
bias=False,
indice_key="c4",
algo=algo),
spconv.SubMConv3d(192,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo), spconv.SubMConv3d(192,
spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo), 224,
3,
bias=False,
indice_key="c5",
algo=algo),
spconv.SubMConv3d(224,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
# nn.BatchNorm1d(128), # nn.BatchNorm1d(128),
# nn.ReLU(), # nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo), spconv.SubMConv3d(224,
spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo), 256,
3,
bias=False,
indice_key="c6",
algo=algo),
spconv.SubMConv3d(256,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
) )
max_batch_size = 1 max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster. # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self.grid = torch.full([max_batch_size, *shape], -1, self.grid = torch.full([max_batch_size, *shape],
dtype=torch.int32, device=self.device) -1,
dtype=torch.int32,
device=self.device)
# self.grid = None # self.grid = None
self.shape = shape self.shape = shape
...@@ -78,7 +148,8 @@ def main(): ...@@ -78,7 +148,8 @@ def main():
voxels, coors, spatial_shape = waymo_data() voxels, coors, spatial_shape = waymo_data()
voxels_th, coors_th = voxels, coors voxels_th, coors_th = voxels, coors
algo = spconv.ConvAlgo.Native algo = spconv.ConvAlgo.Native
net = Net(spatial_shape[::-1], algo, voxels_th.device).cuda(device=voxels_th.device).eval().float() net = Net(spatial_shape[::-1], algo,
voxels_th.device).cuda(device=voxels_th.device).eval().float()
print(coors_th.shape) print(coors_th.shape)
out = net(voxels_th, coors_th, 1) out = net(voxels_th, coors_th, 1)
print(out.spatial_shape) print(out.spatial_shape)
......
Subproject commit 86931fef8538008a1a92036732b3eb7fe47b25d0 Subproject commit fd7e058d0cb3e4bf743edc530c7778a210cb168b
Subproject commit 10ba80acb91f138170b7a22bb86523cb07d6f942 Subproject commit 29764aad4881fde809af6a025c12012e47a55515
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment