Commit 3517290c authored by yanyan
Browse files

format code, add benchmark per layer

parent 540a2209
......@@ -10,14 +10,12 @@
namespace spconv {
void scatter_point_to_grid_cuda(
torch::Tensor points,
torch::Tensor indexes,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
std::vector<int64_t> gridShape,
const int ndim) {
void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
std::vector<int64_t> gridShape,
const int ndim) {
auto stream = at::cuda::getCurrentCUDAStream();
auto num_points = points.size(0);
auto num_features = points.size(1);
......@@ -27,86 +25,84 @@ void scatter_point_to_grid_cuda(
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());
scatterPointToGridKernel<Index, NDim>
<<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<float>(points),
tv::torch2tv<Index>(indexes),
tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndex),
gs);
<<<tv::cuda::getBlocks(num_points), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<float>(points),
tv::torch2tv<Index>(indexes), tv::torch2tv<float>(grids),
tv::torch2tv<Index>(numPointsPerGrid),
tv::torch2tv<Index>(pointIndex), gs);
TV_CHECK_CUDA_ERR_V2("scatterPointToGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, scatterPointToGridKernel<Index, NDim>));
tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim,
">", attr.numRegs);
checkCudaErrors(
cudaFuncGetAttributes(&attr, scatterPointToGridKernel<Index, NDim>));
tv::ssprint("scatterPointToGridKernel<", tv::type_s<Index>, NDim, ">",
attr.numRegs);
#endif
});
});
}
/// Gathers per-voxel data out of the dense grid on the GPU and then clears the
/// scratch buffers, all on the current CUDA stream.
///
/// Three kernels are launched in order:
///   1. resetPointIndexKernel     over `pointIndex.size(0) - 1` elements,
///                                receiving `pointIndex` and `grids.size(0)`.
///   2. gatherPointFromGridKernel over `voxels.size(0)` elements, receiving
///                                `grids`, `numPointsPerGrid`,
///                                `pointIndexUnique`, `voxels`, `coors` and the
///                                grid shape.
///   3. resetGridKernel           over `voxels.size(0)` elements, receiving
///                                `grids`, `numPointsPerGrid` and
///                                `pointIndexUnique`.
///
/// @param grids             viewed as float; read by the gather kernel and
///                          passed to the grid-reset kernel.
/// @param numPointsPerGrid  viewed with the dispatched index type.
/// @param pointIndex        viewed with the dispatched index type; passed to
///                          the point-index reset kernel.
/// @param pointIndexUnique  its scalar type selects the index type used for
///                          every index tensor in this call.
/// @param voxels            viewed as float; its first dimension sizes the
///                          gather and grid-reset launches.
/// @param coors             viewed with the dispatched index type.
/// @param gridShape         copied into a fixed-size vector of `ndim` entries.
/// @param ndim              dispatched over the values 2, 3 and 4.
///
/// NOTE(review): the index type is dispatched over int32_t only; behaviour for
/// other dtypes / ndim values depends on tv::dispatch_* — confirm it throws.
void gather_point_from_grid_cuda(torch::Tensor grids,
                                 torch::Tensor numPointsPerGrid,
                                 torch::Tensor pointIndex,
                                 torch::Tensor pointIndexUnique,
                                 torch::Tensor voxels, torch::Tensor coors,
                                 std::vector<int64_t> gridShape,
                                 const int ndim) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto num_voxel = voxels.size(0);
  auto num_max_points = pointIndex.size(0) - 1;
  auto grid_volume = grids.size(0);
  tv::dispatch_torch<int32_t>(
      pointIndexUnique.scalar_type(), [&](auto IndexValue) {
        using Index = decltype(IndexValue);
        tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
          constexpr int NDim = decltype(I)::value;
          tv::SimpleVector<Index, NDim> gs(gridShape.begin(), gridShape.end());

          resetPointIndexKernel<Index>
              <<<tv::cuda::getBlocks(num_max_points),
                 tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                  tv::torch2tv<Index>(pointIndex), grid_volume);
          TV_CHECK_CUDA_ERR_V2("resetPointIndexKernel failed");
#ifdef TV_LOG_KERNEL_INFO
          // Query the same specialization that was launched above
          // (<Index> only); the kernel is not parameterized on NDim.
          cudaFuncAttributes attr0;
          checkCudaErrors(
              cudaFuncGetAttributes(&attr0, resetPointIndexKernel<Index>));
          tv::ssprint("resetPointIndexKernel<", tv::type_s<Index>, ">",
                      attr0.numRegs);
#endif

          gatherPointFromGridKernel<Index, NDim>
              <<<tv::cuda::getBlocks(num_voxel), tv::cuda::CUDA_NUM_THREADS, 0,
                 stream>>>(tv::torch2tv<float>(grids),
                           tv::torch2tv<Index>(numPointsPerGrid),
                           tv::torch2tv<Index>(pointIndexUnique),
                           tv::torch2tv<float>(voxels),
                           tv::torch2tv<Index>(coors), gs);
          TV_CHECK_CUDA_ERR_V2("gatherPointFromGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
          cudaFuncAttributes attr1;
          checkCudaErrors(cudaFuncGetAttributes(
              &attr1, gatherPointFromGridKernel<Index, NDim>));
          tv::ssprint("gatherPointFromGridKernel<", tv::type_s<Index>, NDim,
                      ">", attr1.numRegs);
#endif

          resetGridKernel<Index><<<tv::cuda::getBlocks(num_voxel),
                                   tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
              tv::torch2tv<float>(grids), tv::torch2tv<Index>(numPointsPerGrid),
              tv::torch2tv<Index>(pointIndexUnique));
          TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
#ifdef TV_LOG_KERNEL_INFO
          // Same as above: resetGridKernel is launched as <Index>, so the
          // attribute query must name that specialization.
          cudaFuncAttributes attr2;
          checkCudaErrors(
              cudaFuncGetAttributes(&attr2, resetGridKernel<Index>));
          tv::ssprint("resetGridKernel<", tv::type_s<Index>, ">",
                      attr2.numRegs);
#endif
        });
      });
}
} // namespace spconv
......@@ -3,23 +3,18 @@
namespace spconv {
int64_t
pointsToVoxel(torch::Tensor points,
torch::Tensor indexes,
torch::Tensor pointIndex,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor voxels,
torch::Tensor coors,
std::vector<int64_t> gridShape,
const int64_t ndim) {
int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
torch::Tensor pointIndex, torch::Tensor grids,
torch::Tensor numPointsPerGrid, torch::Tensor voxels,
torch::Tensor coors, std::vector<int64_t> gridShape,
const int64_t ndim) {
if (points.device().type() == torch::kCPU) {
TV_THROW_INVALID_ARG("not support cpu currently");
}
#ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) {
scatter_point_to_grid_cuda(points, indexes, grids,
numPointsPerGrid, pointIndex, gridShape, ndim);
scatter_point_to_grid_cuda(points, indexes, grids, numPointsPerGrid,
pointIndex, gridShape, ndim);
}
#endif
else {
......@@ -33,8 +28,9 @@ pointsToVoxel(torch::Tensor points,
}
#ifdef TV_CUDA
else if (points.device().type() == torch::kCUDA) {
gather_point_from_grid_cuda(grids, numPointsPerGrid,
pointIndex, pointIndexUnique, voxels, coors, gridShape, ndim);
gather_point_from_grid_cuda(grids, numPointsPerGrid, pointIndex,
pointIndexUnique, voxels, coors, gridShape,
ndim);
}
#endif
else {
......
......@@ -247,10 +247,10 @@ torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
}
template <int Algo>
torch::Tensor
indiceConvFused(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
torch::Tensor indiceConvFused(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
......@@ -282,8 +282,9 @@ indiceConvFused(torch::Tensor features, torch::Tensor filters,
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
FusedConvDispatch<Algo>::fwd(output, features, filters[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
FusedConvDispatch<Algo>::fwd(output, features, filters[i],
indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
......@@ -517,9 +518,8 @@ indiceConvBwNative(torch::Tensor features, torch::Tensor filters,
template <int Algo>
std::vector<torch::Tensor>
indiceConvBwFused(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
bool subM = _subM != 0;
bool inverse = _inverse != 0;
......@@ -557,8 +557,8 @@ indiceConvBwFused(torch::Tensor features, torch::Tensor filters,
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
FusedConvDispatch<Algo>::bwd(features, inputGrad, outGrad, filters[i],
filtersGrad[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
filtersGrad[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
......@@ -723,7 +723,6 @@ template <> struct ConvDispatch<kMinkowskiEngine> {
constexpr static auto *bwd = indiceConvBwFused<kFMinkowskiEngine>;
};
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
......
......@@ -26,38 +26,104 @@ class Net(nn.Module):
def __init__(self, shape, algo):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo),
spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo),
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
algo=algo),
spconv.SubMConv3d(64,
64,
3,
bias=False,
indice_key="c0",
algo=algo),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo),
spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo),
spconv.SubMConv3d(64,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
spconv.SubMConv3d(96,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo),
spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo),
spconv.SubMConv3d(96,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
spconv.SubMConv3d(128,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo),
spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo),
spconv.SubMConv3d(128,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
spconv.SubMConv3d(160,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo),
spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo),
spconv.SubMConv3d(160,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
spconv.SubMConv3d(192,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo),
spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo),
spconv.SubMConv3d(192,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
spconv.SubMConv3d(224,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo),
spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo),
spconv.SubMConv3d(224,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
spconv.SubMConv3d(256,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
)
max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
......
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import spconv
from spconv.utils import VoxelGeneratorV2
def waymo_data(batch_size=1):
    """Voxelize the bundled benchmark point cloud on the CPU.

    Reads the ``"pc"`` array from ``data/benchmark-pc.npz`` located next to
    this file and feeds it through a ``VoxelGeneratorV2`` configured with a
    0.1 voxel size, the range ``[-80, -80, -2, 80, 80, 6]``, at most one
    point per voxel and a limit of 150000 (presumably the maximum number of
    voxels -- confirm against ``VoxelGeneratorV2``).

    Args:
        batch_size: accepted for signature compatibility; not used here.

    Returns:
        A ``(voxels, coors, grid_size)`` tuple where ``voxels`` is the
        generated voxel array reshaped to ``(-1, 3)``, ``coors`` is the
        generated coordinate array with a leading all-zero column prepended
        (presumably the batch index), and ``grid_size`` comes from the
        generator.
    """
    generator = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
                                 1, 150000)
    archive_path = Path(__file__).parent / "data" / "benchmark-pc.npz"
    point_cloud = np.load(archive_path)["pc"]
    generated = generator.generate(point_cloud)

    voxels = generated["voxels"].reshape(-1, 3)
    coordinates = generated["coordinates"]
    # One zero per coordinate row, same dtype as the coordinates themselves.
    zero_column = np.zeros([coordinates.shape[0], 1], dtype=coordinates.dtype)
    coors = np.concatenate([zero_column, coordinates], axis=1)
    return voxels, coors, generator.grid_size
class Net(nn.Module):
    """Benchmark network: seven stages of paired submanifold convolutions.

    Stage ``i`` holds two 3x3x3 ``SubMConv3d`` layers (no bias) that share
    the indice key ``"c<i>"`` and are named ``"subm-<i>-0"`` and
    ``"subm-<i>-1"``. Every stage except the last is followed by a
    ``SparseMaxPool3d(2, 2)`` named ``"pool-<i>"``. Channel widths run
    3 -> 64 -> 96 -> 128 -> 160 -> 192 -> 224 -> 256.
    """

    def __init__(self, shape, algo):
        """Build the layer stack and pre-allocate the indice grid.

        Args:
            shape: spatial shape; stored on the module and used to size the
                pre-allocated grid.
            algo: convolution algorithm forwarded to every ``SubMConv3d``.
        """
        super().__init__()
        widths = [3, 64, 96, 128, 160, 192, 224, 256]
        final_stage = len(widths) - 2

        layers = []
        for stage, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:])):
            indice_key = "c%d" % stage
            # First conv changes the channel count, second keeps it.
            for position, channels_in in enumerate((c_in, c_out)):
                layers.append(
                    spconv.SubMConv3d(channels_in,
                                      c_out,
                                      3,
                                      bias=False,
                                      indice_key=indice_key,
                                      algo=algo,
                                      name="subm-%d-%d" % (stage, position)))
            # No pooling after the last stage.
            if stage != final_stage:
                layers.append(
                    spconv.SparseMaxPool3d(2, 2, name="pool-%d" % stage))
        self.net = spconv.SparseSequential(*layers)

        max_batch_size = 1
        # grid (dense map) is used for indice generation; a pre-allocated
        # grid can run faster.
        grid_dims = [max_batch_size, *shape]
        self.grid = torch.full(grid_dims, -1, dtype=torch.int32).cuda()
        # self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a benchmarking sparse tensor and run the stack.

        Args:
            features: feature tensor handed to ``SparseConvTensor``.
            coors: coordinate tensor handed to ``SparseConvTensor``.
            batch_size: batch size handed to ``SparseConvTensor``.

        Returns:
            The output of the sequential stack.
        """
        sparse_input = spconv.SparseConvTensor(features,
                                               coors,
                                               self.shape,
                                               batch_size,
                                               self.grid,
                                               benchmark=True)
        return self.net(sparse_input)
def main():
    """Run the per-layer benchmark on the bundled point cloud.

    Builds the network with the Minkowski algorithm in float32, does one
    forward pass (printing the coordinate shape and the output spatial
    shape), then times 20 forward passes under ``torch.no_grad()``. The mean
    end-to-end time of the last 10 passes is printed, followed by the
    accumulated ``"time"`` and ``"indice_gen_time"`` records of the layer
    named ``"subm-6-0"``.
    """
    dtype = torch.float32
    voxels, coors, spatial_shape = waymo_data()
    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
    coors_th = torch.from_numpy(coors).cuda().int()
    algo = spconv.ConvAlgo.Minkowski
    net = Net(spatial_shape[::-1], algo).cuda().eval().to(dtype)
    print(coors_th.shape)
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)

    num_runs = 20
    num_warmup_runs = 10  # leading runs excluded from the reported mean
    times = []
    detail_bench = {}
    detail_ind_gen_bench = {}
    with torch.no_grad():
        for _ in range(num_runs):
            torch.cuda.synchronize()
            start = time.perf_counter()
            out = net(voxels_th, coors_th, 1)
            torch.cuda.synchronize()
            # Stop the clock right after the device is idle so the host-side
            # bookkeeping below is not counted as network time.
            times.append(time.perf_counter() - start)
            for layer_name, record in out.benchmark_record.items():
                detail_bench.setdefault(layer_name, []).extend(record["time"])
                detail_ind_gen_bench.setdefault(layer_name, []).extend(
                    record["indice_gen_time"])
            # print((net.grid == -1).float().sum(), net.grid.numel())
            # print("spconv time", time.time() - t)
    print("spconv time", np.mean(times[num_warmup_runs:]))
    print(detail_bench["subm-6-0"])
    print(detail_ind_gen_bench["subm-6-0"])


if __name__ == "__main__":
    main()
......@@ -13,13 +13,17 @@ def waymo_data_gpu(batch_size=1):
print('gpu with total points available per voxel')
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
points = torch.from_numpy(data['pc']).cuda().float()
voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
gen = VoxelGeneratorV3(voxel_size, coors_range, max_points=200000,
num_features=points.shape[1],
dtype=points.dtype,
device=points.device)
voxel_size = torch.Tensor([0.1, 0.1,
0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80,
6]).to(points.dtype).to(points.device)
gen = VoxelGeneratorV3(voxel_size,
coors_range,
max_points=200000,
num_features=points.shape[1],
dtype=points.dtype,
device=points.device)
voxels, coors = gen.generate(points)
times = []
......@@ -40,8 +44,8 @@ def waymo_data_gpu(batch_size=1):
def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
print('cpu with %d max points per voxel' % max_points_per_voxel)
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], max_points_per_voxel,
150000)
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
max_points_per_voxel, 150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = data["pc"]
data = gen.generate(pc)
......@@ -62,6 +66,7 @@ def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size
def get_index(coor, grid_size):
index = coor[0]
for c, g in zip(coor[1:], grid_size):
......@@ -100,5 +105,6 @@ def main():
print('Perfect GPU Voxelization!!!')
if __name__ == "__main__":
main()
......@@ -12,8 +12,10 @@ from spconv.utils import VoxelGeneratorV3
def waymo_data(batch_size=1):
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
points = torch.from_numpy(data['pc']).cuda().float()
voxel_size = torch.Tensor([0.1, 0.1, 0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80, 6]).to(points.dtype).to(points.device)
voxel_size = torch.Tensor([0.1, 0.1,
0.1]).to(points.dtype).to(points.device)
coors_range = torch.Tensor([-80, -80, -2, 80, 80,
6]).to(points.dtype).to(points.device)
gen = VoxelGeneratorV3(voxel_size, coors_range)
voxels, coors = gen.generate(points)
......@@ -28,43 +30,111 @@ class Net(nn.Module):
super().__init__()
self.device = device
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", algo=algo),
spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0", algo=algo),
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
algo=algo),
spconv.SubMConv3d(64,
64,
3,
bias=False,
indice_key="c0",
algo=algo),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(64, 96, 3, bias=False, indice_key="c1", algo=algo),
spconv.SubMConv3d(96, 96, 3, bias=False, indice_key="c1", algo=algo),
spconv.SubMConv3d(64,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
spconv.SubMConv3d(96,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(96, 128, 3, bias=False, indice_key="c2", algo=algo),
spconv.SubMConv3d(128, 128, 3, bias=False, indice_key="c2", algo=algo),
spconv.SubMConv3d(96,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
spconv.SubMConv3d(128,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(128, 160, 3, bias=False, indice_key="c3", algo=algo),
spconv.SubMConv3d(160, 160, 3, bias=False, indice_key="c3", algo=algo),
spconv.SubMConv3d(128,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
spconv.SubMConv3d(160,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(160, 192, 3, bias=False, indice_key="c4", algo=algo),
spconv.SubMConv3d(192, 192, 3, bias=False, indice_key="c4", algo=algo),
spconv.SubMConv3d(160,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
spconv.SubMConv3d(192,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(192, 224, 3, bias=False, indice_key="c5", algo=algo),
spconv.SubMConv3d(224, 224, 3, bias=False, indice_key="c5", algo=algo),
spconv.SubMConv3d(192,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
spconv.SubMConv3d(224,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(224, 256, 3, bias=False, indice_key="c6", algo=algo),
spconv.SubMConv3d(256, 256, 3, bias=False, indice_key="c6", algo=algo),
spconv.SubMConv3d(224,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
spconv.SubMConv3d(256,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
)
max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self.grid = torch.full([max_batch_size, *shape], -1,
dtype=torch.int32, device=self.device)
self.grid = torch.full([max_batch_size, *shape],
-1,
dtype=torch.int32,
device=self.device)
# self.grid = None
self.shape = shape
......@@ -78,7 +148,8 @@ def main():
voxels, coors, spatial_shape = waymo_data()
voxels_th, coors_th = voxels, coors
algo = spconv.ConvAlgo.Native
net = Net(spatial_shape[::-1], algo, voxels_th.device).cuda(device=voxels_th.device).eval().float()
net = Net(spatial_shape[::-1], algo,
voxels_th.device).cuda(device=voxels_th.device).eval().float()
print(coors_th.shape)
out = net(voxels_th, coors_th, 1)
print(out.spatial_shape)
......
Subproject commit 86931fef8538008a1a92036732b3eb7fe47b25d0
Subproject commit fd7e058d0cb3e4bf743edc530c7778a210cb168b
Subproject commit 10ba80acb91f138170b7a22bb86523cb07d6f942
Subproject commit 29764aad4881fde809af6a025c12012e47a55515
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment