Commit bf011c76 authored by yan.yan
Browse files

temp commit

parent 4791f582
......@@ -19,6 +19,67 @@ from spconv.utils import Point2VoxelCPU3d
from spconv.pytorch.utils import PointToVoxel
import torch
def main_pytorch_voxel_gen():
    """Demonstrate the torch-based voxel generator on random CPU points.

    Runs the generator three times on the same point cloud and prints:
    the first raw voxel, the first voxel when ``empty_mean=True`` is
    requested, and the voxel ids reported for the first ten points.
    """
    np.random.seed(50051)
    # voxel gen source code: spconv/csrc/sparse/pointops.py
    voxel_generator = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
                                   coors_range_xyz=[-80, -80, -6, 80, 80, 6],
                                   num_point_features=3,
                                   max_num_voxels=5000,
                                   max_num_points_per_voxel=5)
    points_th = torch.from_numpy(np.random.uniform(-4, 4, size=[1000, 3]))

    # Pass 1: plain voxelization.
    voxels_th, indices_th, num_p_in_vx_th = voxel_generator(points_th)
    voxels_np, indices_np, num_p_in_vx_np = [
        t.numpy() for t in (voxels_th, indices_th, num_p_in_vx_th)
    ]
    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
    print(voxels_np[0])

    # Pass 2: run voxel gen and FILL MEAN VALUE to voxel remain
    voxels_th, indices_th, num_p_in_vx_th = voxel_generator(points_th,
                                                            empty_mean=True)
    voxels_np, indices_np, num_p_in_vx_np = [
        t.numpy() for t in (voxels_th, indices_th, num_p_in_vx_th)
    ]
    print("------Voxels with mean filled-------")
    print(voxels_np[0])

    # Pass 3: also ask for the per-point voxel id.
    result_with_id = voxel_generator.generate_voxel_with_id(points_th,
                                                            empty_mean=True)
    voxels_th, indices_th, num_p_in_vx_th, pc_voxel_id = result_with_id
    print("------Voxel ids for every point-------")
    print(pc_voxel_id[:10])
def main_pytorch_voxel_gen_cuda():
    """Demonstrate the torch-based voxel generator on ``cuda:0``.

    Mirrors the CPU demo with float32 points moved to the GPU. The last
    step prints the first ten input points next to the voxel indices
    looked up through their reported voxel ids.
    """
    np.random.seed(50051)
    # voxel gen source code: spconv/csrc/sparse/pointops.py
    device = torch.device("cuda:0")
    voxel_generator = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
                                   coors_range_xyz=[-80, -80, -6, 80, 80, 6],
                                   num_point_features=3,
                                   max_num_voxels=5000,
                                   max_num_points_per_voxel=5,
                                   device=device)
    pc = np.random.uniform(-4, 4, size=[1000, 3]).astype(np.float32)
    points_th = torch.from_numpy(pc).to(device)

    # Pass 1: plain voxelization; results are copied back to host for printing.
    voxels_th, indices_th, num_p_in_vx_th = voxel_generator(points_th)
    voxels_np, indices_np, num_p_in_vx_np = [
        t.cpu().numpy() for t in (voxels_th, indices_th, num_p_in_vx_th)
    ]
    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
    print(voxels_np[0])

    # Pass 2: run voxel gen and FILL MEAN VALUE to voxel remain
    voxels_tv, indices_tv, num_p_in_vx_tv = voxel_generator(points_th,
                                                            empty_mean=True)
    voxels_np, indices_np, num_p_in_vx_np = [
        t.cpu().numpy() for t in (voxels_tv, indices_tv, num_p_in_vx_tv)
    ]
    print("------Voxels with mean filled-------")
    print(voxels_np[0])

    # Pass 3: also ask for the per-point voxel id.
    result_with_id = voxel_generator.generate_voxel_with_id(points_th,
                                                            empty_mean=True)
    voxels_th, indices_th, num_p_in_vx_th, pc_voxel_id = result_with_id
    print("------Voxel ids for every point-------")
    print(pc[:10])
    print(indices_th[pc_voxel_id[:10]])
def main():
np.random.seed(50051)
......@@ -81,58 +142,26 @@ def main_point_with_features():
print("------Voxels with mean filled-------")
print(voxels_np[0])
def main_pytorch_voxel_gen():
def main_cuda():
np.random.seed(50051)
# voxel gen source code: spconv/csrc/sparse/pointops.py
gen = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
coors_range_xyz=[-80, -80, -2, 80, 80, 6],
num_point_features=3,
max_num_voxels=5000,
max_num_points_per_voxel=5)
from spconv.utils import Point2VoxelGPU3d
pc = np.random.uniform(-10, 10, size=[1000, 3])
pc_th = torch.from_numpy(pc)
voxels_th, indices_th, num_p_in_vx_th = gen(pc_th)
voxels_np = voxels_th.numpy()
indices_np = indices_th.numpy()
num_p_in_vx_np = num_p_in_vx_th.numpy()
print(f"------Raw Voxels {voxels_np.shape[0]}-------")
print(voxels_np[0])
# run voxel gen and FILL MEAN VALUE to voxel remain
voxels_tv, indices_tv, num_p_in_vx_tv = gen(pc_th, empty_mean=True)
voxels_np = voxels_tv.numpy()
indices_np = indices_tv.numpy()
num_p_in_vx_np = num_p_in_vx_tv.numpy()
print("------Voxels with mean filled-------")
print(voxels_np[0])
def main_pytorch_voxel_gen_cuda():
np.random.seed(50051)
# voxel gen source code: spconv/csrc/sparse/pointops.py
device = torch.device("cuda:0")
gen = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
gen = Point2VoxelGPU3d(vsize_xyz=[0.1, 0.1, 0.1],
coors_range_xyz=[-80, -80, -2, 80, 80, 6],
num_point_features=3,
max_num_voxels=5000,
max_num_points_per_voxel=5,
device=device)
max_num_points_per_voxel=5)
pc = np.random.uniform(-10, 10, size=[1000, 3]).astype(np.float32)
pc_th = torch.from_numpy(pc).to(device)
voxels_th, indices_th, num_p_in_vx_th = gen(pc_th)
voxels_np = voxels_th.cpu().numpy()
indices_np = indices_th.cpu().numpy()
num_p_in_vx_np = num_p_in_vx_th.cpu().numpy()
print(f"------Raw Voxels {voxels_np.shape[0]}-------")
print(voxels_np[0])
# run voxel gen and FILL MEAN VALUE to voxel remain
voxels_tv, indices_tv, num_p_in_vx_tv = gen(pc_th, empty_mean=True)
pc = np.random.uniform(-10, 10, size=[100000, 3]).astype(np.float32)
pc_tv = tv.from_numpy(pc).cuda()
# generate voxels, note that voxels_tv reference to a persistent buffer in generator,
# so we can't run it in multi-thread.
voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_hash(pc_tv)
voxels_np = voxels_tv.cpu().numpy()
indices_np = indices_tv.cpu().numpy()
num_p_in_vx_np = num_p_in_vx_tv.cpu().numpy()
print("------Voxels with mean filled-------")
print(f"------CUDA Raw Voxels {voxels_np.shape[0]}-------")
print(voxels_np[0])
......@@ -141,4 +170,5 @@ if __name__ == "__main__":
main_point_with_features()
main_pytorch_voxel_gen()
if torch.cuda.is_available():
main_cuda()
main_pytorch_voxel_gen_cuda()
import sys
from pathlib import Path
from typing import Dict, List, Tuple
import pickle
import sys
import time
from pathlib import Path
from cumm.gemm.algospec.core import GemmAlgo
import numpy as np
import pccm
import torch
import torch.nn.functional as F
from cumm import dtypes
from cumm import tensorview as tv
from cumm.constants import PACKAGE_ROOT
from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
from cumm.conv.main import ConvMainUnitTest, gen_gemm_kernels
from cumm.conv.params import ConvProblem
from cumm.gemm import kernel
import os
from spconv.core_cc.csrc.sparse.all import SpconvOps
from cumm.gemm.codeops import div_up
from spconv.constants import PACKAGE_ROOT
from spconv.core import ConvAlgo
from spconv.pytorch import ops
from spconv.algo import CONV, BestConvAlgoByProfile
from spconv.pytorch.cppcore import torch_tensor_to_tv
def reduce_mask_count(mask: np.ndarray, width: int):
    """OR-reduce ``mask`` in groups of ``width`` and return the bit total.

    ``mask`` is zero-padded up to a multiple of ``width``, every run of
    ``width`` consecutive entries is OR-ed into one word, the words are
    handed to ``SpconvOps.count_bits``, and the summed result is
    multiplied by ``width``.
    """
    num_entries = mask.shape[0]
    padded_len = div_up(num_entries, width) * width
    if num_entries < padded_len:
        # Zero padding does not add any set bits to the last group.
        padded = np.zeros((padded_len, ), dtype=mask.dtype)
        padded[:num_entries] = mask
        mask = padded
    group_or = np.bitwise_or.reduce(mask.reshape(-1, width), axis=1)
    bit_counts = SpconvOps.count_bits(tv.from_numpy(group_or)).numpy()
    return bit_counts.sum() * width
def reduce_mask_count_x(mask: np.ndarray, width: int):
    """Return the OR of every ``width`` consecutive entries of ``mask``.

    ``mask`` is zero-padded up to a multiple of ``width`` first, so the
    result has ``div_up(len(mask), width)`` entries.
    """
    num_entries = mask.shape[0]
    padded_len = div_up(num_entries, width) * width
    if num_entries < padded_len:
        padded = np.zeros((padded_len, ), dtype=mask.dtype)
        padded[:num_entries] = mask
        mask = padded
    grouped = mask.reshape(-1, width)
    return np.bitwise_or.reduce(grouped, axis=1)
# Development routine: builds indice pairs for the pickled test voxels with
# both the native and the implicit-gemm pair generators, and (when run_conv
# is set and execution gets that far) runs the tuned conv kernels and prints
# the error norm against a numpy reference.
#   subm:     choose the 3x3x3 / padding-1 configuration instead of the
#             2x2x2 / padding-0 one, and pass subm through to the pair ops.
#   run_conv: gate for the kernel-vs-reference comparison section.
# NOTE(review): indentation was lost in this view of the file, so the nesting
# of the statements below (loop bodies, branch extents) must be confirmed
# against the real source.
def dev_subm_inds_v2(subm: bool = True, run_conv: bool = True):
limit_input_n = 16384
# The assignment below overrides the limit above; slicing with None keeps every row.
limit_input_n = None
np.random.seed(484)
with (PACKAGE_ROOT.parent / "test/data/test_spconv.pkl").open("rb") as f:
voxels_np, indices_np, spatial_shape = pickle.load(f)
# generate_sparse_data is only referenced by the commented-out block below.
from spconv.test_utils import generate_sparse_data
voxels_np = voxels_np[:limit_input_n]
indices_np = indices_np[:limit_input_n]
# spatial_shape = [19, 18, 17]
# sparse_dict = generate_sparse_data(spatial_shape, [1024], 128)
# voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
# np.float32)
# indices_np = np.ascontiguousarray(
# sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
voxels = tv.from_numpy(voxels_np).cuda()
indices = tv.from_numpy(indices_np).cuda()
indices_th = torch.from_numpy(indices_np).cuda()
print(spatial_shape, indices_np.shape)
ndim = 3
# Convolution geometry; kv is the number of kernel offsets (product of ksize).
if subm:
ksize = [3, 3, 3]
kv = np.prod(ksize)
padding = [1] * ndim
stride = [1] * ndim
dilation = [1] * ndim
out_padding = [0] * ndim
else:
ksize = [2, 2, 2]
kv = np.prod(ksize)
padding = [0] * ndim
stride = [1] * ndim
dilation = [1] * ndim
out_padding = [0] * ndim
# Pairs from the native algorithm; they feed the numpy reference loops further down.
out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
indices_th, 1, spatial_shape, ConvAlgo.Native, ksize, stride, padding,
dilation, out_padding, subm)
indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
indice_pairs_np = pair_ref.cpu().numpy()
# algo is fixed to MaskImplicitGemm here, so num_split is always 1.
algo = ConvAlgo.MaskImplicitGemm
if algo == ConvAlgo.MaskImplicitGemm:
num_split = 1
else:
num_split = 2
# The implicit-gemm pair generation is invoked in a 5-iteration loop.
for i in range(5):
res = ops.get_indice_pairs_implicit_gemm(indices_th, 1, spatial_shape,
algo, ksize, stride, padding,
dilation, out_padding, subm)
out_inds = res[0]
num_inds_per_loc = res[1]
pair_fwd = res[2]
pair_fwd_x = pair_fwd.cpu().numpy().reshape(-1)
# -1 entries are zeroed, then entries > 0 are counted per kernel offset.
# NOTE(review): an entry equal to 0 is therefore not counted either.
pair_fwd_x[pair_fwd_x == -1] = 0
loc_num_np = (pair_fwd_x > 0).reshape(kv, -1).sum(1)
print(loc_num_np)
print(indice_num_per_loc_np)
pair_bwd = res[3]
pair_mask_fwd_splits = res[4]
pair_mask_bwd_splits = res[5]
mask_tv = torch_tensor_to_tv(pair_mask_fwd_splits[0], dtype=tv.uint32).cpu().numpy()
bench_reduce_mask(mask_tv)
# NOTE(review): if this return sits at function scope, everything below it is
# unreachable -- confirm against the real indentation.
return
mask_argsort_fwd_splits = res[6]
mask_argsort_bwd_splits = res[7]
masks = res[8]
pair_mask_fwd_splits_tv = [
ops.torch_tensor_to_tv(t, dtype=tv.uint32)
for t in pair_mask_fwd_splits
]
valid_location_bitcount = [
SpconvOps.count_bits(t) for t in pair_mask_fwd_splits_tv
]
valid_location_count = sum(
[t.cpu().numpy().sum() for t in valid_location_bitcount])
reduce_length = 32
split_mask_valid_count = sum([
reduce_mask_count(t.cpu().numpy(), reduce_length)
for t in pair_mask_fwd_splits_tv
])
if subm:
print("SUBM", valid_location_count, split_mask_valid_count,
pair_fwd.numel())
else:
print("REGULAR", valid_location_count, split_mask_valid_count,
pair_fwd.numel())
# return
if run_conv:
# C and K are the input / output channel counts used for the random tensors.
C = 64
K = 64
desps = CONV.desps
mask_output_fwd = torch.zeros([2, div_up(out_inds.shape[0], 32)],
dtype=torch.int32,
device=indices_th.device)
mask_output_bwd = torch.zeros([2, div_up(indices.dim(0), 32)],
dtype=torch.int32,
device=indices_th.device)
for desp in desps:
# Only kernel descriptors whose algo is Simt are exercised.
if desp.algo != GemmAlgo.Simt.value:
continue
# if desp.op_type == ConvOpType.kBackwardWeight.value:
# continue
# if desp.tile_shape !
if desp.dtype_a == dtypes.int8.tv_dtype:
inp = np.random.randint(-1, 1, size=[voxels_np.shape[0],
C]).astype(np.int8)
weight = np.random.randint(-1, 1, size=[K, *ksize,
C]).astype(np.int8)
output = np.random.randint(-1, 1, size=[
out_inds.shape[0], K
]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
else:
inp = np.random.uniform(-1, 1, size=[
voxels_np.shape[0], C
]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_input))
weight = np.random.uniform(-1, 1, size=[K, *ksize, C]).astype(
dtypes.get_npdtype_from_tvdtype(desp.dtype_weight))
output = np.random.uniform(-1, 1, size=[
out_inds.shape[0], K
]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
# Reference weight layout: kernel offsets first, i.e. [KV, K, C].
weight_ref = weight.transpose(1, 2, 3, 0, 4)
weight_ref = np.ascontiguousarray(weight_ref).reshape(-1, K, C)
# The tensor that the selected op type computes starts from zeros; the
# other two are uploaded from the random numpy data.
if desp.op_type == ConvOpType.kBackwardInput.value:
inp_tv = tv.zeros(inp.shape, desp.dtype_input, 0)
else:
inp_tv = tv.from_numpy(inp).cuda()
if desp.op_type == ConvOpType.kBackwardWeight.value:
weight_tv = tv.zeros(weight.shape, desp.dtype_weight, 0)
else:
weight_tv = tv.from_numpy(weight).cuda()
# _ = tv.zeros([5000, 10], tv.float32, 0)
if desp.op_type == ConvOpType.kForward.value:
output_tv = tv.zeros(output.shape, desp.dtype_output, 0)
else:
output_tv = tv.from_numpy(output).cuda()
torch.cuda.synchronize()
t = time.time()
# spk is forwarded to BestConvAlgoByProfile; 32 is used for backward-weight.
spk = 1
if desp.op_type == ConvOpType.kBackwardWeight.value:
# TODO support splitk parallel
spk = 32
if subm:
if desp.op_type == ConvOpType.kForward.value:
indice_pairs = pair_fwd
elif desp.op_type == ConvOpType.kBackwardInput.value:
indice_pairs = pair_bwd
else:
indice_pairs = pair_fwd
mask_output = mask_output_fwd
# print([bin(x.item()) for x in masks])
for j in range(num_split):
# beta is 1 only for the second split (presumably so that split
# accumulates into the first one's result -- TODO confirm).
beta = 1 if j == 1 else 0
mask_filter = 0xffffffff
mask_filter = masks[j].item()
reverse_mask = False
if desp.op_type == ConvOpType.kBackwardWeight.value:
mask_op = mask_output[j]
else:
mask_op = pair_mask_fwd_splits[j]
if desp.op_type == ConvOpType.kBackwardInput.value:
reverse_mask = True
CONV.run_with_tuned_result(
BestConvAlgoByProfile(desp, spk),
desp.op_type,
inp_tv,
weight_tv,
output_tv,
torch_tensor_to_tv(mask_op, dtype=tv.uint32),
torch_tensor_to_tv(mask_argsort_fwd_splits[j]),
torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
torch_tensor_to_tv(indice_pairs),
reverse_mask,
mask_filter=mask_filter,
mask_width=32,
beta=beta,
verbose=True,
)
else:
# Regular conv: forward and backward-input use their own pair / mask /
# argsort sets; backward-weight reuses the forward ones.
if desp.op_type == ConvOpType.kForward.value:
indice_pairs = pair_fwd # inp -> out
mask_ops = pair_mask_fwd_splits
mask_argsorts = mask_argsort_fwd_splits
mask_output = mask_output_fwd
elif desp.op_type == ConvOpType.kBackwardInput.value:
indice_pairs = pair_bwd # out -> inp
mask_ops = pair_mask_bwd_splits
mask_argsorts = mask_argsort_bwd_splits
mask_output = mask_output_bwd
print([bin(x.item()) for x in masks])
else:
indice_pairs = pair_fwd # inp -> out
mask_ops = pair_mask_fwd_splits
mask_argsorts = mask_argsort_fwd_splits
mask_output = mask_output_fwd
# NOTE(review): this loop always runs 2 splits, while the subm branch uses
# num_split (1 for MaskImplicitGemm) -- confirm this is intended.
for j in range(2):
beta = 1 if j == 1 else 0
mask_filter = masks[j].item()
reverse_mask = False
if desp.op_type == ConvOpType.kBackwardWeight.value:
mask_op = mask_output[j]
else:
mask_op = mask_ops[j]
CONV.run_with_tuned_result(
BestConvAlgoByProfile(desp, spk),
desp.op_type,
inp_tv,
weight_tv,
output_tv,
torch_tensor_to_tv(mask_op, dtype=tv.uint32),
torch_tensor_to_tv(mask_argsorts[j]),
torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
torch_tensor_to_tv(indice_pairs),
reverse_mask,
mask_filter=mask_filter,
mask_width=32,
beta=beta,
verbose=True,
)
torch.cuda.synchronize()
duration = time.time() - t
# Numpy references: gather rows through the native pairs for each kernel
# offset, multiply with that offset's weight slice, and scatter-add.
if desp.op_type == ConvOpType.kForward.value:
output_ref = np.zeros_like(output, dtype=np.float32)
# ref algorithm
for filter_offset in range(kv):
# For subm, offsets past the centre read the count stored at the mirrored
# offset, and the centre offset uses the number of input voxels.
if subm and filter_offset > kv // 2:
nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
elif subm and filter_offset == kv // 2:
nhot = voxels.shape[0]
else:
nhot = indice_num_per_loc_np[filter_offset]
a_inds = indice_pairs_np[0][filter_offset][:nhot]
c_inds = indice_pairs_np[1][filter_offset][:nhot]
# print(a_inds_cpu[:10])
a = inp[a_inds]
cc = a.astype(
np.float32) @ weight_ref[filter_offset].T.astype(
np.float32)
output_ref[c_inds] += cc
output_cpu = output_tv.cpu().numpy().astype(np.float32)
# duration is recomputed here, so it now also spans the reference loop above.
duration = time.time() - t
my = output_cpu.reshape(-1)
print("ERROR", np.linalg.norm(output_ref.reshape(-1) - my))
elif desp.op_type == ConvOpType.kBackwardInput.value:
dinput_ref = np.zeros_like(inp, dtype=np.float32)
# ref algorithm
for filter_offset in range(kv):
if subm and filter_offset > kv // 2:
nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
elif subm and filter_offset == kv // 2:
nhot = voxels.shape[0]
else:
nhot = indice_num_per_loc_np[filter_offset]
# Pair roles are swapped relative to forward: gather from output rows,
# scatter into input rows.
a_inds = indice_pairs_np[1][filter_offset][:nhot]
c_inds = indice_pairs_np[0][filter_offset][:nhot]
# print(a_inds_cpu[:10])
a = output[a_inds]
# NK @ KC
cc = a.astype(
np.float32) @ weight_ref[filter_offset].astype(
np.float32)
dinput_ref[c_inds] += cc
din_cpu = inp_tv.cpu().numpy()
print(
"ERROR",
np.linalg.norm(
din_cpu.reshape(-1) - dinput_ref.reshape(-1)))
else:
dw_ref = np.zeros_like(weight_ref,
dtype=np.float32) # KV, K, C
for filter_offset in range(kv):
if subm and filter_offset > kv // 2:
nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
elif subm and filter_offset == kv // 2:
nhot = voxels.shape[0]
else:
nhot = indice_num_per_loc_np[filter_offset]
o_inds = indice_pairs_np[1][filter_offset][:nhot]
i_inds = indice_pairs_np[0][filter_offset][:nhot]
# print(a_inds_cpu[:10])
out_gather = output[o_inds] # [N, K]
inp_gather = inp[i_inds] # [N, C]
# KN @ NC
dw_res = out_gather.astype(
np.float32).T @ inp_gather.astype(np.float32)
dw_ref[filter_offset] = dw_res
# print(indice_pairs_np_test[0])
# Swap the first two axes so the reference is [K, KV, C] like dw_cpu below.
dw_ref_kcrs = dw_ref.transpose(1, 0, 2)
dw_cpu = weight_tv.cpu().numpy().reshape(K, np.prod(ksize), C)
print(
"ERROR",
np.linalg.norm(
dw_cpu.reshape(-1) - dw_ref_kcrs.reshape(-1)))
def reverse_bits(a: np.ndarray):
    """Reverse the bit order inside every byte of ``a``.

    The bits are unpacked least-significant first and packed again
    most-significant first. The result is a flat ``uint8`` array.
    """
    return np.packbits(np.unpackbits(a, bitorder="little"))
def _count_mask_reduce(masks: np.ndarray):
    """Print the raw bit total, the 64-wide reduced count, and their ratio.

    The raw total is the sum of ``SpconvOps.count_bits`` over ``masks``;
    the reduced count comes from ``reduce_mask_count(masks, 64)``.
    """
    per_mask_bits = SpconvOps.count_bits(tv.from_numpy(masks))
    total_bits = per_mask_bits.numpy_view().sum()
    reduced_total = reduce_mask_count(masks, 64)
    print(total_bits, reduced_total, reduced_total / total_bits)
def bench_reduce_mask(masks: np.ndarray, width: int = 27):
    """Print reduce statistics for several orderings of ``masks``.

    Four variants go through ``_count_mask_reduce``: the sorted masks, the
    sorted masks restricted to the low ``width // 2 + 1`` bits, the sorted
    concatenation of the masks with their bit-reversed copies, and that
    concatenation restricted to the same low bits and sorted again.
    """
    # masks = np.random.randint(0, 2000000000, size=[100000], dtype=np.uint32)# & 0xffff
    all_ones = np.array(0xffffffff, dtype=np.uint32)
    half_bits = width // 2 + 1
    # width_mask and width_half_mask_left are computed but not used below.
    width_mask = all_ones << (32 - width) >> (32 - width)
    width_half_mask = all_ones >> (32 - half_bits)
    width_half_mask_left = width_half_mask << half_bits
    print(bin(width_half_mask))

    # np.sort returns a sorted copy, leaving the caller's array untouched.
    _count_mask_reduce(np.sort(masks))
    _count_mask_reduce(np.sort(masks & width_half_mask))
    # masks.sort()
    # masks = masks & 0xffff
    mirrored = SpconvOps.reverse_bits(tv.from_numpy(masks)).numpy()# & 0xffff0000
    combined = np.concatenate([masks, mirrored])
    np.random.shuffle(combined)
    combined.sort()
    _count_mask_reduce(combined)
    combined &= width_half_mask
    combined.sort()
    _count_mask_reduce(combined)
# Script entry point: run the indice-pair development routine with its defaults.
if __name__ == "__main__":
dev_subm_inds_v2()
......@@ -131,9 +131,9 @@ class SimpleGemm:
# skip volta tensor op since it is very slow in architectures except volta.
if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
continue
lda = a.dim(1)
ldb = b.dim(1)
ldc = c.dim(1)
lda = a.stride[0]
ldb = b.stride[0]
ldc = c.stride[0]
if desp.supported_ldx(lda, ldb, ldc):
finally_algos.append(desp)
return finally_algos
......
from .basic import bench_basic
import fire
# Script entry point: hand control to python-fire, called without an explicit component.
if __name__ == "__main__":
fire.Fire()
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from cumm import tensorview as tv
from spconv.core import ConvAlgo
from cumm import dtypes
import spconv.pytorch as spconv
from spconv.test_utils import params_grid
class Net(nn.Module):
    """Benchmark network: paired sub-manifold convs with max-pool downsampling.

    Seven stages of two ``SubMConv3d`` layers each widen the features from
    3 to 256 channels. Every stage except the last is followed by a
    ``SparseMaxPool3d(2, 2)``.
    """

    def __init__(self, shape, algo):
        super().__init__()
        pool_algo = algo
        # pool_algo = ConvAlgo.Native

        # Input width followed by the output width of each conv stage.
        stage_channels = [3, 64, 96, 128, 160, 192, 224, 256]
        # indice_key of the pool that follows each stage; None means the pool
        # is built without an indice_key. The last stage has no pool.
        pool_indice_keys = [None, None, None, None, "m4", "m5"]

        layers = []
        for stage, (c_in, c_out) in enumerate(
                zip(stage_channels[:-1], stage_channels[1:])):
            conv_key = f"c{stage}"
            # Both convs of a stage share one indice_key.
            layers.append(
                spconv.SubMConv3d(c_in,
                                  c_out,
                                  3,
                                  bias=False,
                                  indice_key=conv_key,
                                  algo=algo))
            layers.append(
                spconv.SubMConv3d(c_out,
                                  c_out,
                                  3,
                                  bias=False,
                                  indice_key=conv_key,
                                  algo=algo))
            if stage < len(pool_indice_keys):
                pool_kwargs = {"algo": pool_algo}
                if pool_indice_keys[stage] is not None:
                    pool_kwargs["indice_key"] = pool_indice_keys[stage]
                layers.append(spconv.SparseMaxPool3d(2, 2, **pool_kwargs))
        self.net = spconv.SparseSequential(*layers)

        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
        self.grid = torch.full([max_batch_size, *shape], -1,
                               dtype=torch.int32).cuda()
        # self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size, enable_timer: bool = False):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack."""
        sparse_input = spconv.SparseConvTensor(features,
                                               coors,
                                               self.shape,
                                               batch_size,
                                               self.grid,
                                               enable_timer=enable_timer)
        return self.net(sparse_input)
# cumm dtypes that bench_basic accepts, mapped to the torch dtype used for the
# tensors and the network.
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_basic(dtype_str: str):
    """Time forward and backward passes of ``Net`` for each conv algorithm.

    Args:
        dtype_str: dtype shortcut resolved by ``dtypes.get_dtype_by_shortcut``;
            it must map to an entry of ``_DTYPE_TO_TORCH_DTYPE``.

    Raises:
        NotImplementedError: if the resolved dtype is not in
            ``_DTYPE_TO_TORCH_DTYPE``.

    For every algorithm the forward pass runs 20 times under ``no_grad`` and
    the mean of the last 10 timings is printed. The backward pass then runs
    10 times and the mean of the last 5 timings is printed.
    """
    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
    if dtype not in _DTYPE_TO_TORCH_DTYPE:
        raise NotImplementedError("only support bench f32 and f16 for now")
    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
    algos = [
        spconv.ConvAlgo.Native,
        spconv.ConvAlgo.MaskImplicitGemm,
        spconv.ConvAlgo.MaskSplitImplicitGemm,
    ]
    voxels, coors, spatial_shape = get_voxel_data()
    device = torch.device("cuda:0")
    for (algo, ) in params_grid(algos):
        voxels_th = torch.from_numpy(voxels).to(device).to(torch_dtype)
        coors_th = torch.from_numpy(coors).to(device).int()
        voxels_th.requires_grad = True
        net = Net(spatial_shape, algo).to(device).train().to(torch_dtype)# .train()
        spconv.assign_name_for_sparse_modules(net)

        # One untimed pass, used to size the random output gradient.
        with torch.no_grad():
            out: spconv.SparseConvTensor = net(voxels_th, coors_th, 1)
        dout = np.random.uniform(-0.2, 0.2,
                                 out.features.shape).astype(np.float32)
        dout_t = torch.from_numpy(dout).to(device).to(torch_dtype)

        forward_times = []
        with torch.no_grad():
            for _ in range(20):
                torch.cuda.synchronize()
                start = time.time()
                out_nograd = net(voxels_th, coors_th, 1, False)
                timer = out_nograd._timer
                torch.cuda.synchronize()
                forward_times.append(time.time() - start)
        print(f"basic[{dtype_str}|{algo}|forward]",
              np.mean(forward_times[10:]))

        backward_times = []
        for _ in range(10):
            # The forward pass that rebuilds the graph is outside the timed span.
            out = net(voxels_th, coors_th, 1)
            torch.cuda.synchronize()
            start = time.time()
            out.features.backward(dout_t)
            torch.cuda.synchronize()
            backward_times.append(time.time() - start)
        print(f"basic[{dtype_str}|{algo}|backward]",
              np.mean(backward_times[5:]))
# Script entry point: run the benchmark with the "f16" dtype shortcut.
if __name__ == "__main__":
bench_basic("f16")
\ No newline at end of file
import requests
import fire
import pickle
from io import BytesIO
import numpy as np
from spconv.constants import PACKAGE_ROOT
# Download locations used when the repository's test/data directory is not
# available next to the installed package.
RAW_TEST_DATA_PATH = "https://raw.githubusercontent.com/traveller59/spconv/v2.1.10/test/data/test_spconv.pkl"
# NOTE(review): RAW_PC_PATH used to point at the pickle above, which
# get_pc_data cannot read with np.load; confirm the .npz is published at this
# tag.
RAW_PC_PATH = "https://raw.githubusercontent.com/traveller59/spconv/v2.1.10/test/data/benchmark-pc.npz"


def _download_to_buffer(url: str) -> BytesIO:
    """Stream ``url`` into an in-memory buffer rewound to its start.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    buffer = BytesIO()
    with requests.get(url, stream=True) as req:
        req.raise_for_status()
        for chunk in req.iter_content(chunk_size=8192):
            buffer.write(chunk)
    buffer.seek(0)
    return buffer


def get_voxel_data():
    """Return the pickled ``(voxels, coors, spatial_shape)`` test data.

    The copy under ``<package parent>/test/data`` is used when it exists;
    otherwise the file is downloaded from ``RAW_TEST_DATA_PATH``.
    """
    editable_test_data_path = PACKAGE_ROOT.parent / "test/data/test_spconv.pkl"
    if editable_test_data_path.exists():
        with editable_test_data_path.open("rb") as f:
            return pickle.load(f)
    # SECURITY: unpickling executes code chosen by whoever controls the
    # payload; this relies on the pinned URL above being trustworthy.
    (voxels, coors, spatial_shape) = pickle.load(
        _download_to_buffer(RAW_TEST_DATA_PATH))
    return voxels, coors, spatial_shape


def get_pc_data():
    """Return the ``"pc"`` array of the benchmark point-cloud archive.

    The copy under ``<package parent>/test/data`` is used when it exists;
    otherwise the ``.npz`` archive is downloaded from ``RAW_PC_PATH``.
    """
    editable_test_data_path = PACKAGE_ROOT.parent / "test/data/benchmark-pc.npz"
    if editable_test_data_path.exists():
        pc = np.load(str(editable_test_data_path))["pc"]
        return pc
    pc = np.load(_download_to_buffer(RAW_PC_PATH))["pc"]
    return pc
# Script entry point: fetch the benchmark point cloud and print its first ten rows.
if __name__ == "__main__":
pc = get_pc_data()
print(pc[:10])
\ No newline at end of file
......@@ -452,7 +452,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -464,7 +464,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -476,7 +476,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*gen_conv_params(ConvBwdWeight, (128, 128, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......
......@@ -298,7 +298,7 @@ class SpconvOps:
"""
...
@staticmethod
def point2voxel_cpu(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point2voxel_cpu(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, pc_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -306,6 +306,7 @@ class SpconvOps:
indices:
num_per_voxel:
densehashdata:
pc_voxel_id:
vsize:
grid_size:
grid_stride:
......@@ -315,7 +316,7 @@ class SpconvOps:
"""
...
@staticmethod
def point2voxel_cuda(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
def point2voxel_cuda(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, pc_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -324,6 +325,7 @@ class SpconvOps:
num_per_voxel:
hashdata:
point_indice_data:
pc_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -29,7 +29,7 @@ class Point2Voxel:
"""
...
@staticmethod
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -38,6 +38,7 @@ class Point2Voxel:
num_per_voxel:
hashdata:
point_indice_data:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -29,7 +29,7 @@ class Point2Voxel:
"""
...
@staticmethod
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -38,6 +38,7 @@ class Point2Voxel:
num_per_voxel:
hashdata:
point_indice_data:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -29,7 +29,7 @@ class Point2Voxel:
"""
...
@staticmethod
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -38,6 +38,7 @@ class Point2Voxel:
num_per_voxel:
hashdata:
point_indice_data:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -29,7 +29,7 @@ class Point2Voxel:
"""
...
@staticmethod
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -38,6 +38,7 @@ class Point2Voxel:
num_per_voxel:
hashdata:
point_indice_data:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -27,7 +27,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -35,6 +35,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......@@ -43,7 +44,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -51,6 +52,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -27,7 +27,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -35,6 +35,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......@@ -43,7 +44,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -51,6 +52,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -27,7 +27,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -35,6 +35,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......@@ -43,7 +44,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -51,6 +52,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -27,7 +27,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -35,6 +35,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......@@ -43,7 +44,7 @@ class Point2VoxelCPU:
"""
...
@staticmethod
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
"""
Args:
points:
......@@ -51,6 +52,7 @@ class Point2VoxelCPU:
indices:
num_per_voxel:
densehashdata:
points_voxel_id:
vsize:
grid_size:
grid_stride:
......
......@@ -920,7 +920,7 @@ class SpconvOps(pccm.Class):
def point2voxel_cpu(self):
code = pccm.FunctionCode()
code.arg("points", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, densehashdata", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, densehashdata, pc_voxel_id", "tv::Tensor")
code.arg("vsize", f"std::vector<float>")
code.arg("grid_size, grid_stride", f"std::vector<int>")
code.arg("coors_range", f"std::vector<float>")
......@@ -950,11 +950,11 @@ class SpconvOps(pccm.Class):
}}
if (empty_mean){{
return Point2Voxel{ndim}DCPU::point_to_voxel_empty_mean_static(points, voxels, indices,
num_per_voxel, densehashdata,
num_per_voxel, densehashdata, pc_voxel_id,
vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels);
}} else{{
return Point2Voxel{ndim}DCPU::point_to_voxel_static(points, voxels, indices,
num_per_voxel, densehashdata,
num_per_voxel, densehashdata, pc_voxel_id,
vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels);
}}
}}
......@@ -967,7 +967,7 @@ class SpconvOps(pccm.Class):
def point2voxel_cuda(self):
code = pccm.FunctionCode()
code.arg("points", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data, pc_voxel_id",
"tv::Tensor")
code.arg("vsize", f"std::vector<float>")
code.arg("grid_size, grid_stride", f"std::vector<int>")
......@@ -1000,7 +1000,7 @@ class SpconvOps(pccm.Class):
coors_range_[i + {ndim}] = coors_range[i + {ndim}];
}}
return Point2Voxel{ndim}D::point_to_voxel_hash_static(points, voxels, indices,
num_per_voxel, hashdata, point_indice_data,
num_per_voxel, hashdata, point_indice_data, pc_voxel_id,
vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels,
empty_mean, stream_int);
}}
......
......@@ -208,6 +208,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
code.arg("points_indice_data", f"const int64_t*")
code.arg("voxels", f"{self.dtype} *")
code.arg("num_per_voxel", f"int *")
code.arg("points_voxel_id", f"int64_t*")
code.arg("point_stride", f"int")
code.arg("max_points_per_voxel", f"int")
......@@ -219,14 +220,17 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
code.arg("grid_stride", f"tv::array<int, {self.ndim}>")
code.arg("num_points", f"int")
# TODO add backward?
code.raw(f"""
int voxel_stride0 = point_stride * max_points_per_voxel;
for (int i : tv::KernelLoopX<int>(num_points)){{
int64_t prod = points_indice_data[i];
int voxel_id = -1;
if (prod != -1){{
auto voxel_index_pair = table.lookup(prod);
if (!voxel_index_pair.empty() &&
voxel_index_pair.second < max_voxels) {{
voxel_id = voxel_index_pair.second;
int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1);
if (old < max_points_per_voxel) {{
for (int j = 0; j < point_stride; ++j) {{
......@@ -235,6 +239,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
}}
}}
}}
points_voxel_id[i] = voxel_id;
}}
""")
return code
......@@ -385,6 +390,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, 0);
int64_t expected_hash_data_num = points.dim(0) * 2;
if (hashdata.dim(0) < expected_hash_data_num){{
hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
......@@ -393,74 +399,18 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
}}
return point_to_voxel_hash_static(points, voxels, indices, num_per_voxel,
hashdata, point_indice_data, Point2VoxelCommon::tvarray2array(vsize),
hashdata, point_indice_data, points_voxel_id, Point2VoxelCommon::tvarray2array(vsize),
Point2VoxelCommon::tvarray2array(grid_size), Point2VoxelCommon::tvarray2array(grid_stride),
Point2VoxelCommon::tvarray2array(coors_range), clear_voxels, empty_mean, stream_int);
""")
return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
code.raw(f"""
TV_ASSERT_INVALID_ARG(points.ndim() == 2 && points.dim(1) >= {self.ndim}, "error");
using V = int64_t;
using KeyType = int64_t;
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
if (clear_voxels){{
voxels.zero_();
}}
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
// int64_t expected_hash_data_num = int64_t(tv::hash::align_to_power2(points.dim(0) * 2));
int64_t expected_hash_data_num = points.dim(0) * 2;
if (hashdata.dim(0) < expected_hash_data_num){{
hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
}}
if (point_indice_data.dim(0) < points.dim(0)){{
point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
}}
// auto timer = tv::CudaContextTimer<>();
num_per_voxel.zero_();
table_t hash = table_t(hashdata.data_ptr<pair_t>(), expected_hash_data_num);
hash.clear();
// tv::ssprint("clear time", timer.report());
auto launcher = tv::cuda::Launch(points.dim(0));
launcher(kernel::build_hash_table<table_t>, hash, points.data_ptr<const {self.dtype}>(),
point_indice_data.data_ptr<int64_t>(),
points.dim(1), vsize, coors_range, grid_size, grid_stride, points.dim(0));
// tv::ssprint("build_hash_table", timer.report());
auto table_launcher = tv::cuda::Launch(hash.size());
tv::Tensor count = tv::zeros({{1}}, tv::int32, 0);
Layout layout = Layout::from_shape(grid_size);
table_launcher(kernel::assign_table<table_t>, hash, indices.data_ptr<int>(),
count.data_ptr<int>(),
layout, voxels.dim(0));
auto count_cpu = count.cpu();
int count_val = count_cpu.item<int32_t>();
// tv::ssprint("assign_table", timer.report());
launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1),
voxels.dim(0), vsize, coors_range,
grid_size, grid_stride, points.dim(0));
// tv::ssprint("generate_voxel", timer.report());
return std::make_tuple(voxels.slice_first_axis(0, count_val),
indices.slice_first_axis(0, count_val),
num_per_voxel.slice_first_axis(0, count_val));
""")
return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
@pccm.pybind.mark
@pccm.cuda.static_function
def point_to_voxel_hash_static(self):
code = pccm.FunctionCode()
code.arg("points", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data, points_voxel_id",
"tv::Tensor")
code.arg("vsize", f"std::array<float, {self.ndim}>")
code.arg("grid_size, grid_stride", f"std::array<int, {self.ndim}>")
......@@ -516,7 +466,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1),
num_per_voxel.data_ptr<int>(), points_voxel_id.data_ptr<int64_t>(), points.dim(1), voxels.dim(1),
voxels.dim(0), vsize_tv, coors_range_tv,
grid_size_tv, grid_stride_tv, points.dim(0));
// tv::ssprint("generate_voxel", timer.report());
......@@ -636,7 +586,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
def point_to_voxel_static_template(self, mean: bool = False):
code = pccm.FunctionCode()
code.arg("points", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, densehashdata", "tv::Tensor")
code.arg("voxels, indices, num_per_voxel, densehashdata, points_voxel_id", "tv::Tensor")
code.arg("vsize", f"std::array<float, {self.ndim}>")
code.arg("grid_size, grid_stride", f"std::array<int, {self.ndim}>")
code.arg("coors_range", f"std::array<float, {self.ndim * 2}>")
......@@ -653,6 +603,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
if (clear_voxels){{
voxels.zero_();
}}
auto points_voxel_id_ptr = points_voxel_id.data_ptr<int64_t>();
int res_voxel_num = 0;
int num_features = points.dim(1);
auto N = points.dim(0);
......@@ -680,20 +631,25 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
}}
coor[j] = c;
}}
if (failed)
if (failed){{
points_voxel_id_ptr[i] = -1;
continue;
}}
voxelidx = coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))});
if (voxelidx == -1) {{
voxelidx = voxel_num;
if (voxel_num >= max_num_voxels)
if (voxel_num >= max_num_voxels){{
points_voxel_id_ptr[i] = -1;
continue;
}}
voxel_num += 1;
coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))}) = voxelidx;
for (int k = 0; k < {self.ndim}; ++k) {{
coors_rw(voxelidx, k) = coor[k];
}}
}}
points_voxel_id_ptr[i] = voxelidx;
num = num_points_per_voxel_rw(voxelidx);
if (num < max_num_points_per_voxel) {{
// voxel_point_mask_rw(voxelidx, num) = {self.dtype}(1);
......@@ -781,8 +737,10 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
code.arg("points", "tv::Tensor")
code.arg("clear_voxels", "bool", "true")
code.raw(f"""
tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, -1);
return point_to_voxel_static(points, voxels, indices, num_per_voxel, densehashdata,
tvarray2array(vsize),
points_voxel_id, tvarray2array(vsize),
tvarray2array(grid_size), tvarray2array(grid_stride),
tvarray2array(coors_range), clear_voxels);
""")
......@@ -795,8 +753,10 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
code.arg("points", "tv::Tensor")
code.arg("clear_voxels", "bool", "true")
code.raw(f"""
tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, -1);
return point_to_voxel_empty_mean_static(points, voxels, indices, num_per_voxel,
densehashdata, tvarray2array(vsize),
densehashdata, points_voxel_id, tvarray2array(vsize),
tvarray2array(grid_size), tvarray2array(grid_stride),
tvarray2array(coors_range), clear_voxels);
""")
......
......@@ -27,3 +27,15 @@ try:
except:
# for unknown errors, just set a version
PYTORCH_VERSION = [1, 8, 0]
# AMP support flag: True when the detected PyTorch version is at least 1.6.0.
# The element-wise list comparison already yields a bool.
TORCH_HAS_AMP = PYTORCH_VERSION >= [1, 6, 0]
def is_amp_enabled():
    """Report whether torch autocast is currently active.

    Returns ``False`` without querying torch when ``TORCH_HAS_AMP`` is
    falsy; otherwise returns the result of ``torch.is_autocast_enabled()``.
    """
    if not TORCH_HAS_AMP:
        return False
    return torch.is_autocast_enabled()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment