temp commit

bf011c76 · yan.yan · 4791f582 · bf011c76 · 4791f582 · bf011c76
Commit bf011c76 authored Nov 23, 2021 by yan.yan
20 changed files
--- a/example/voxel_gen.py
+++ b/example/voxel_gen.py
@@ -19,6 +19,67 @@ from spconv.utils import Point2VoxelCPU3d
 from spconv.pytorch.utils import PointToVoxel
 import torch

+def main_pytorch_voxel_gen():
+    """Demo: voxelize a random point cloud with PointToVoxel and print the results.
+
+    No device is passed to PointToVoxel and the outputs are converted with
+    ``.numpy()`` directly (no ``.cpu()``), i.e. this variant runs on CPU tensors.
+    """
+    # fixed seed so the random point cloud is the same on every run
+    np.random.seed(50051)
+    # voxel gen source code: spconv/csrc/sparse/pointops.py
+    gen = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
+                       coors_range_xyz=[-80, -80, -6, 80, 80, 6],
+                       num_point_features=3,
+                       max_num_voxels=5000,
+                       max_num_points_per_voxel=5)
+
+    # 1000 random points with 3 values each, sampled uniformly from [-4, 4)
+    pc = np.random.uniform(-4, 4, size=[1000, 3])
+    pc_th = torch.from_numpy(pc)
+    # first run: default arguments (empty_mean is not set)
+    voxels_th, indices_th, num_p_in_vx_th = gen(pc_th)
+    voxels_np = voxels_th.numpy()
+    # indices_np / num_p_in_vx_np are not used afterwards in this function
+    indices_np = indices_th.numpy()
+    num_p_in_vx_np = num_p_in_vx_th.numpy()
+    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
+    print(voxels_np[0])
+    # run voxel gen again with empty_mean=True: FILL the MEAN VALUE into the
+    # remaining (empty) point slots of each voxel
+    voxels_th, indices_th, num_p_in_vx_th = gen(pc_th, empty_mean=True)
+    voxels_np = voxels_th.numpy()
+    indices_np = indices_th.numpy()
+    num_p_in_vx_np = num_p_in_vx_th.numpy()
+    print("------Voxels with mean filled-------")
+    print(voxels_np[0])
+    # generate_voxel_with_id returns a fourth value, pc_voxel_id, in addition
+    # to the three outputs above; its first 10 entries are printed
+    voxels_th, indices_th, num_p_in_vx_th, pc_voxel_id = gen.generate_voxel_with_id(pc_th, empty_mean=True)
+    print("------Voxel ids for every point-------")
+    print(pc_voxel_id[:10])
+
+
+
+def main_pytorch_voxel_gen_cuda():
+    """Demo: same voxelization walkthrough as the CPU variant, on device "cuda:0".
+
+    The generator is built with ``device=cuda:0``, the float32 point cloud is
+    moved to that device, and every output is copied back with ``.cpu()``
+    before it is converted to numpy.
+    """
+    # fixed seed so the random point cloud is the same on every run
+    np.random.seed(50051)
+    # voxel gen source code: spconv/csrc/sparse/pointops.py
+    device = torch.device("cuda:0")
+    gen = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
+                       coors_range_xyz=[-80, -80, -6, 80, 80, 6],
+                       num_point_features=3,
+                       max_num_voxels=5000,
+                       max_num_points_per_voxel=5,
+                       device=device)
+
+    # 1000 random points with 3 values each in [-4, 4), cast to float32
+    pc = np.random.uniform(-4, 4, size=[1000, 3]).astype(np.float32)
+    pc_th = torch.from_numpy(pc).to(device)
+    # first run: default arguments (empty_mean is not set)
+    voxels_th, indices_th, num_p_in_vx_th = gen(pc_th)
+    voxels_np = voxels_th.cpu().numpy()
+    # indices_np / num_p_in_vx_np are not used afterwards in this function
+    indices_np = indices_th.cpu().numpy()
+    num_p_in_vx_np = num_p_in_vx_th.cpu().numpy()
+    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
+    print(voxels_np[0])
+    # run voxel gen again with empty_mean=True: FILL the MEAN VALUE into the
+    # remaining (empty) point slots of each voxel
+    voxels_tv, indices_tv, num_p_in_vx_tv = gen(pc_th, empty_mean=True)
+    voxels_np = voxels_tv.cpu().numpy()
+    indices_np = indices_tv.cpu().numpy()
+    num_p_in_vx_np = num_p_in_vx_tv.cpu().numpy()
+    print("------Voxels with mean filled-------")
+    print(voxels_np[0])
+    # generate_voxel_with_id returns a fourth value, pc_voxel_id, in addition
+    # to the three outputs above
+    voxels_th, indices_th, num_p_in_vx_th, pc_voxel_id = gen.generate_voxel_with_id(pc_th, empty_mean=True)
+    print("------Voxel ids for every point-------")
+    # show the first 10 input points, then the rows of indices_th selected by
+    # the voxel ids of those same 10 points
+    print(pc[:10])
+    print(indices_th[pc_voxel_id[:10]])
+

 def main():
    np.random.seed(50051)
@@ -81,58 +142,26 @@ def main_point_with_features():
    print("------Voxels with mean filled-------")
    print(voxels_np[0])

-
-def main_pytorch_voxel_gen():
+def main_cuda():
    np.random.seed(50051)
-    # voxel gen source code: spconv/csrc/sparse/pointops.py
-    gen = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
-                       coors_range_xyz=[-80, -80, -2, 80, 80, 6],
-                       num_point_features=3,
-                       max_num_voxels=5000,
-                       max_num_points_per_voxel=5)
+    from spconv.utils import Point2VoxelGPU3d

-    pc = np.random.uniform(-10, 10, size=[1000, 3])
-    pc_th = torch.from_numpy(pc)
-    voxels_th, indices_th, num_p_in_vx_th = gen(pc_th)
-    voxels_np = voxels_th.numpy()
-    indices_np = indices_th.numpy()
-    num_p_in_vx_np = num_p_in_vx_th.numpy()
-    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
-    print(voxels_np[0])
-    # run voxel gen and FILL MEAN VALUE to voxel remain
-    voxels_tv, indices_tv, num_p_in_vx_tv = gen(pc_th, empty_mean=True)
-    voxels_np = voxels_tv.numpy()
-    indices_np = indices_tv.numpy()
-    num_p_in_vx_np = num_p_in_vx_tv.numpy()
-    print("------Voxels with mean filled-------")
-    print(voxels_np[0])
-
-
-def main_pytorch_voxel_gen_cuda():
-    np.random.seed(50051)
    # voxel gen source code: spconv/csrc/sparse/pointops.py
-    device = torch.device("cuda:0")
-    gen = PointToVoxel(vsize_xyz=[0.1, 0.1, 0.1],
+    gen = Point2VoxelGPU3d(vsize_xyz=[0.1, 0.1, 0.1],
                           coors_range_xyz=[-80, -80, -2, 80, 80, 6],
                           num_point_features=3,
                           max_num_voxels=5000,
-                       max_num_points_per_voxel=5,
-                       device=device)
+                           max_num_points_per_voxel=5)

-    pc = np.random.uniform(-10, 10, size=[1000, 3]).astype(np.float32)
-    pc_th = torch.from_numpy(pc).to(device)
-    voxels_th, indices_th, num_p_in_vx_th = gen(pc_th)
-    voxels_np = voxels_th.cpu().numpy()
-    indices_np = indices_th.cpu().numpy()
-    num_p_in_vx_np = num_p_in_vx_th.cpu().numpy()
-    print(f"------Raw Voxels {voxels_np.shape[0]}-------")
-    print(voxels_np[0])
-    # run voxel gen and FILL MEAN VALUE to voxel remain
-    voxels_tv, indices_tv, num_p_in_vx_tv = gen(pc_th, empty_mean=True)
+    pc = np.random.uniform(-10, 10, size=[100000, 3]).astype(np.float32)
+    pc_tv = tv.from_numpy(pc).cuda()
+    # generate voxels, note that voxels_tv reference to a persistent buffer in generator,
+    # so we can't run it in multi-thread.
+    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_hash(pc_tv)
    voxels_np = voxels_tv.cpu().numpy()
    indices_np = indices_tv.cpu().numpy()
    num_p_in_vx_np = num_p_in_vx_tv.cpu().numpy()
-    print("------Voxels with mean filled-------")
+    print(f"------CUDA Raw Voxels {voxels_np.shape[0]}-------")
    print(voxels_np[0])


@@ -141,4 +170,5 @@ if __name__ == "__main__":
    main_point_with_features()
    main_pytorch_voxel_gen()
    if torch.cuda.is_available():
+        main_cuda()
        main_pytorch_voxel_gen_cuda()
--- a/scripts/dev_subm.py
+++ b/scripts/dev_subm.py
-import sys
-from pathlib import Path
-from typing import Dict, List, Tuple
-import pickle
-import sys
-import time
-from pathlib import Path
-from cumm.gemm.algospec.core import GemmAlgo
-
-import numpy as np
-import pccm
-import torch
-import torch.nn.functional as F
-
-from cumm import dtypes
-from cumm import tensorview as tv
-from cumm.constants import PACKAGE_ROOT
-from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
-from cumm.conv.main import ConvMainUnitTest, gen_gemm_kernels
-from cumm.conv.params import ConvProblem
-from cumm.gemm import kernel
-import os
-from spconv.core_cc.csrc.sparse.all import SpconvOps
-from cumm.gemm.codeops import div_up
-from spconv.constants import PACKAGE_ROOT
-from spconv.core import ConvAlgo
-
-from spconv.pytorch import ops
-from spconv.algo import CONV, BestConvAlgoByProfile
-from spconv.pytorch.cppcore import torch_tensor_to_tv
-
-
-def reduce_mask_count(mask: np.ndarray, width: int):
-    mask_length_32 = (div_up(mask.shape[0], width)) * width
-    if mask.shape[0] < mask_length_32:
-        mask_pad = np.zeros((mask_length_32, ), dtype=mask.dtype)
-        mask_pad[:mask.shape[0]] = mask
-        mask = mask_pad
-    mask = mask.reshape(-1, width)
-    maskr = np.bitwise_or.reduce(mask, axis=1)
-    maskr_tv = tv.from_numpy(maskr)
-    return SpconvOps.count_bits(maskr_tv).numpy().sum() * width
-
-
-def reduce_mask_count_x(mask: np.ndarray, width: int):
-    mask_length_32 = (div_up(mask.shape[0], width)) * width
-    if mask.shape[0] < mask_length_32:
-        mask_pad = np.zeros((mask_length_32, ), dtype=mask.dtype)
-        mask_pad[:mask.shape[0]] = mask
-        mask = mask_pad
-    mask = mask.reshape(-1, width)
-    maskr = np.bitwise_or.reduce(mask, axis=1)
-    return maskr
-
-
-def dev_subm_inds_v2(subm: bool = True, run_conv: bool = True):
-    limit_input_n = 16384
-    limit_input_n = None
-    np.random.seed(484)
-
-    with (PACKAGE_ROOT.parent / "test/data/test_spconv.pkl").open("rb") as f:
-        voxels_np, indices_np, spatial_shape = pickle.load(f)
-        from spconv.test_utils import generate_sparse_data
-        voxels_np = voxels_np[:limit_input_n]
-        indices_np = indices_np[:limit_input_n]
-
-        # spatial_shape = [19, 18, 17]
-        # sparse_dict = generate_sparse_data(spatial_shape, [1024], 128)
-
-        # voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
-        #     np.float32)
-        # indices_np = np.ascontiguousarray(
-        #     sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
-
-        voxels = tv.from_numpy(voxels_np).cuda()
-        indices = tv.from_numpy(indices_np).cuda()
-        indices_th = torch.from_numpy(indices_np).cuda()
-    print(spatial_shape, indices_np.shape)
-    ndim = 3
-    if subm:
-        ksize = [3, 3, 3]
-        kv = np.prod(ksize)
-        padding = [1] * ndim
-        stride = [1] * ndim
-        dilation = [1] * ndim
-        out_padding = [0] * ndim
-    else:
-        ksize = [2, 2, 2]
-        kv = np.prod(ksize)
-        padding = [0] * ndim
-        stride = [1] * ndim
-        dilation = [1] * ndim
-        out_padding = [0] * ndim
-    out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
-        indices_th, 1, spatial_shape, ConvAlgo.Native, ksize, stride, padding,
-        dilation, out_padding, subm)
-    indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
-    indice_pairs_np = pair_ref.cpu().numpy()
-    algo = ConvAlgo.MaskImplicitGemm
-    if algo == ConvAlgo.MaskImplicitGemm:
-        num_split = 1
-    else:
-        num_split = 2
-    for i in range(5):
-        res = ops.get_indice_pairs_implicit_gemm(indices_th, 1, spatial_shape,
-                                                 algo, ksize, stride, padding,
-                                                 dilation, out_padding, subm)
-    out_inds = res[0]
-    num_inds_per_loc = res[1]
-    pair_fwd = res[2]
-    pair_fwd_x = pair_fwd.cpu().numpy().reshape(-1)
-    pair_fwd_x[pair_fwd_x == -1] = 0
-    loc_num_np = (pair_fwd_x > 0).reshape(kv, -1).sum(1)
-    print(loc_num_np)
-    print(indice_num_per_loc_np)
-
-    pair_bwd = res[3]
-    pair_mask_fwd_splits = res[4]
-
-    pair_mask_bwd_splits = res[5]
-    mask_tv = torch_tensor_to_tv(pair_mask_fwd_splits[0], dtype=tv.uint32).cpu().numpy()
-    bench_reduce_mask(mask_tv)
-    return
-
-    mask_argsort_fwd_splits = res[6]
-    mask_argsort_bwd_splits = res[7]
-    masks = res[8]
-    pair_mask_fwd_splits_tv = [
-        ops.torch_tensor_to_tv(t, dtype=tv.uint32)
-        for t in pair_mask_fwd_splits
-    ]
-    valid_location_bitcount = [
-        SpconvOps.count_bits(t) for t in pair_mask_fwd_splits_tv
-    ]
-    valid_location_count = sum(
-        [t.cpu().numpy().sum() for t in valid_location_bitcount])
-    reduce_length = 32
-    split_mask_valid_count = sum([
-        reduce_mask_count(t.cpu().numpy(), reduce_length)
-        for t in pair_mask_fwd_splits_tv
-    ])
-    if subm:
-        print("SUBM", valid_location_count, split_mask_valid_count,
-              pair_fwd.numel())
-    else:
-        print("REGULAR", valid_location_count, split_mask_valid_count,
-              pair_fwd.numel())
-    # return
-
-    if run_conv:
-        C = 64
-        K = 64
-        desps = CONV.desps
-        mask_output_fwd = torch.zeros([2, div_up(out_inds.shape[0], 32)],
-                                      dtype=torch.int32,
-                                      device=indices_th.device)
-        mask_output_bwd = torch.zeros([2, div_up(indices.dim(0), 32)],
-                                      dtype=torch.int32,
-                                      device=indices_th.device)
-
-        for desp in desps:
-            if desp.algo != GemmAlgo.Simt.value:
-                continue
-            # if desp.op_type == ConvOpType.kBackwardWeight.value:
-            #     continue
-            # if desp.tile_shape !
-            if desp.dtype_a == dtypes.int8.tv_dtype:
-                inp = np.random.randint(-1, 1, size=[voxels_np.shape[0],
-                                                     C]).astype(np.int8)
-                weight = np.random.randint(-1, 1, size=[K, *ksize,
-                                                        C]).astype(np.int8)
-                output = np.random.randint(-1, 1, size=[
-                    out_inds.shape[0], K
-                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
-            else:
-                inp = np.random.uniform(-1, 1, size=[
-                    voxels_np.shape[0], C
-                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_input))
-                weight = np.random.uniform(-1, 1, size=[K, *ksize, C]).astype(
-                    dtypes.get_npdtype_from_tvdtype(desp.dtype_weight))
-                output = np.random.uniform(-1, 1, size=[
-                    out_inds.shape[0], K
-                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
-            weight_ref = weight.transpose(1, 2, 3, 0, 4)
-            weight_ref = np.ascontiguousarray(weight_ref).reshape(-1, K, C)
-            if desp.op_type == ConvOpType.kBackwardInput.value:
-                inp_tv = tv.zeros(inp.shape, desp.dtype_input, 0)
-            else:
-                inp_tv = tv.from_numpy(inp).cuda()
-            if desp.op_type == ConvOpType.kBackwardWeight.value:
-                weight_tv = tv.zeros(weight.shape, desp.dtype_weight, 0)
-            else:
-                weight_tv = tv.from_numpy(weight).cuda()
-            # _ = tv.zeros([5000, 10], tv.float32, 0)
-            if desp.op_type == ConvOpType.kForward.value:
-                output_tv = tv.zeros(output.shape, desp.dtype_output, 0)
-            else:
-                output_tv = tv.from_numpy(output).cuda()
-            torch.cuda.synchronize()
-            t = time.time()
-            spk = 1
-            if desp.op_type == ConvOpType.kBackwardWeight.value:
-                # TODO support splitk parallel
-                spk = 32
-            if subm:
-                if desp.op_type == ConvOpType.kForward.value:
-                    indice_pairs = pair_fwd
-                elif desp.op_type == ConvOpType.kBackwardInput.value:
-                    indice_pairs = pair_bwd
-                else:
-                    indice_pairs = pair_fwd
-                mask_output = mask_output_fwd
-                # print([bin(x.item()) for x in masks])
-                for j in range(num_split):
-                    beta = 1 if j == 1 else 0
-                    mask_filter = 0xffffffff
-                    mask_filter = masks[j].item()
-
-                    reverse_mask = False
-                    if desp.op_type == ConvOpType.kBackwardWeight.value:
-                        mask_op = mask_output[j]
-                    else:
-                        mask_op = pair_mask_fwd_splits[j]
-                    if desp.op_type == ConvOpType.kBackwardInput.value:
-                        reverse_mask = True
-                    CONV.run_with_tuned_result(
-                        BestConvAlgoByProfile(desp, spk),
-                        desp.op_type,
-                        inp_tv,
-                        weight_tv,
-                        output_tv,
-                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
-                        torch_tensor_to_tv(mask_argsort_fwd_splits[j]),
-                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
-                        torch_tensor_to_tv(indice_pairs),
-                        reverse_mask,
-                        mask_filter=mask_filter,
-                        mask_width=32,
-                        beta=beta,
-                        verbose=True,
-                    )
-            else:
-                if desp.op_type == ConvOpType.kForward.value:
-                    indice_pairs = pair_fwd  # inp -> out
-                    mask_ops = pair_mask_fwd_splits
-                    mask_argsorts = mask_argsort_fwd_splits
-                    mask_output = mask_output_fwd
-                elif desp.op_type == ConvOpType.kBackwardInput.value:
-                    indice_pairs = pair_bwd  # out -> inp
-                    mask_ops = pair_mask_bwd_splits
-                    mask_argsorts = mask_argsort_bwd_splits
-                    mask_output = mask_output_bwd
-
-                    print([bin(x.item()) for x in masks])
-                else:
-                    indice_pairs = pair_fwd  # inp -> out
-                    mask_ops = pair_mask_fwd_splits
-                    mask_argsorts = mask_argsort_fwd_splits
-                    mask_output = mask_output_fwd
-
-                for j in range(2):
-                    beta = 1 if j == 1 else 0
-                    mask_filter = masks[j].item()
-                    reverse_mask = False
-                    if desp.op_type == ConvOpType.kBackwardWeight.value:
-                        mask_op = mask_output[j]
-                    else:
-                        mask_op = mask_ops[j]
-
-                    CONV.run_with_tuned_result(
-                        BestConvAlgoByProfile(desp, spk),
-                        desp.op_type,
-                        inp_tv,
-                        weight_tv,
-                        output_tv,
-                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
-                        torch_tensor_to_tv(mask_argsorts[j]),
-                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
-                        torch_tensor_to_tv(indice_pairs),
-                        reverse_mask,
-                        mask_filter=mask_filter,
-                        mask_width=32,
-                        beta=beta,
-                        verbose=True,
-                    )
-
-            torch.cuda.synchronize()
-            duration = time.time() - t
-            if desp.op_type == ConvOpType.kForward.value:
-                output_ref = np.zeros_like(output, dtype=np.float32)
-                # ref algorithm
-                for filter_offset in range(kv):
-                    if subm and filter_offset > kv // 2:
-                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
-                    elif subm and filter_offset == kv // 2:
-                        nhot = voxels.shape[0]
-                    else:
-                        nhot = indice_num_per_loc_np[filter_offset]
-                    a_inds = indice_pairs_np[0][filter_offset][:nhot]
-                    c_inds = indice_pairs_np[1][filter_offset][:nhot]
-                    # print(a_inds_cpu[:10])
-                    a = inp[a_inds]
-                    cc = a.astype(
-                        np.float32) @ weight_ref[filter_offset].T.astype(
-                            np.float32)
-                    output_ref[c_inds] += cc
-
-                output_cpu = output_tv.cpu().numpy().astype(np.float32)
-                duration = time.time() - t
-                my = output_cpu.reshape(-1)
-                print("ERROR", np.linalg.norm(output_ref.reshape(-1) - my))
-
-            elif desp.op_type == ConvOpType.kBackwardInput.value:
-                dinput_ref = np.zeros_like(inp, dtype=np.float32)
-                # ref algorithm
-                for filter_offset in range(kv):
-                    if subm and filter_offset > kv // 2:
-                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
-                    elif subm and filter_offset == kv // 2:
-                        nhot = voxels.shape[0]
-                    else:
-                        nhot = indice_num_per_loc_np[filter_offset]
-                    a_inds = indice_pairs_np[1][filter_offset][:nhot]
-                    c_inds = indice_pairs_np[0][filter_offset][:nhot]
-
-                    # print(a_inds_cpu[:10])
-                    a = output[a_inds]
-                    # NK @ KC
-                    cc = a.astype(
-                        np.float32) @ weight_ref[filter_offset].astype(
-                            np.float32)
-                    dinput_ref[c_inds] += cc
-                din_cpu = inp_tv.cpu().numpy()
-                print(
-                    "ERROR",
-                    np.linalg.norm(
-                        din_cpu.reshape(-1) - dinput_ref.reshape(-1)))
-            else:
-                dw_ref = np.zeros_like(weight_ref,
-                                       dtype=np.float32)  # KV, K, C
-                for filter_offset in range(kv):
-                    if subm and filter_offset > kv // 2:
-                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
-                    elif subm and filter_offset == kv // 2:
-                        nhot = voxels.shape[0]
-                    else:
-                        nhot = indice_num_per_loc_np[filter_offset]
-                    o_inds = indice_pairs_np[1][filter_offset][:nhot]
-                    i_inds = indice_pairs_np[0][filter_offset][:nhot]
-                    # print(a_inds_cpu[:10])
-                    out_gather = output[o_inds]  # [N, K]
-                    inp_gather = inp[i_inds]  # [N, C]
-                    # KN @ NC
-                    dw_res = out_gather.astype(
-                        np.float32).T @ inp_gather.astype(np.float32)
-                    dw_ref[filter_offset] = dw_res
-                # print(indice_pairs_np_test[0])
-                dw_ref_kcrs = dw_ref.transpose(1, 0, 2)
-                dw_cpu = weight_tv.cpu().numpy().reshape(K, np.prod(ksize), C)
-
-                print(
-                    "ERROR",
-                    np.linalg.norm(
-                        dw_cpu.reshape(-1) - dw_ref_kcrs.reshape(-1)))
-
-def reverse_bits(a: np.ndarray):
-    a_unpack = np.unpackbits(a, bitorder="little")
-    return np.packbits(a_unpack)
-
-def _count_mask_reduce(masks: np.ndarray):
-    masks_tv_count = SpconvOps.count_bits(tv.from_numpy(masks))
-    masks_tv_count_sum = masks_tv_count.numpy_view().sum()
-
-    reduce_count = reduce_mask_count(masks, 64)
-    print(masks_tv_count_sum, reduce_count, reduce_count / masks_tv_count_sum)
-
-
-def bench_reduce_mask(masks: np.ndarray, width: int = 27):
-    # masks = np.random.randint(0, 2000000000, size=[100000], dtype=np.uint32)#  & 0xffff
-    width_mask = np.array(0xffffffff, dtype=np.uint32) << (32 - width) >> (32 - width)
-
-    width_half_mask = np.array(0xffffffff, dtype=np.uint32) >> (32 - width // 2 - 1)
-    width_half_mask_left = width_half_mask << (width // 2 + 1)
-    print(bin(width_half_mask))
-    masks_sort = masks.copy()
-    masks_sort.sort()
-    _count_mask_reduce(masks_sort)
-    masks_sort = masks.copy() & width_half_mask
-    masks_sort.sort()
-    _count_mask_reduce(masks_sort)
-
-    # masks.sort()
-    # masks = masks & 0xffff
-
-    reversed_masks = SpconvOps.reverse_bits(tv.from_numpy(masks)).numpy()#  & 0xffff0000
-    new_masks = np.concatenate([masks, reversed_masks])
-    
-    np.random.shuffle(new_masks)
-    new_masks.sort()
-    _count_mask_reduce(new_masks)
-    new_masks &= width_half_mask
-    new_masks.sort()
-    _count_mask_reduce(new_masks)
-
-
-
-
-if __name__ == "__main__":
-    dev_subm_inds_v2()
--- a/spconv/algo.py
+++ b/spconv/algo.py
@@ -131,9 +131,9 @@ class SimpleGemm:
                # skip volta tensor op since it is very slow in architectures except volta.
                if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
                    continue
-                lda = a.dim(1)
-                ldb = b.dim(1)
-                ldc = c.dim(1)
+                lda = a.stride[0]
+                ldb = b.stride[0]
+                ldc = c.stride[0]
                if desp.supported_ldx(lda, ldb, ldc):
                    finally_algos.append(desp)
        return finally_algos

--- a/spconv/benchmark/__init__.py
+++ b/spconv/benchmark/__init__.py
--- a/spconv/benchmark/__main__.py
+++ b/spconv/benchmark/__main__.py
+from .basic import bench_basic
+
+import fire
+
+# CLI entry point. fire.Fire() is called without a component, so python-fire
+# exposes the names visible in this module (e.g. the imported bench_basic)
+# as sub-commands.
+if __name__ == "__main__":
+    fire.Fire()
--- a/spconv/benchmark/basic.py
+++ b/spconv/benchmark/basic.py
+from spconv.benchmark.core import get_voxel_data
+
+
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+from torch import nn
+from cumm import tensorview as tv
+from spconv.core import ConvAlgo
+from cumm import dtypes
+import spconv.pytorch as spconv
+from spconv.test_utils import params_grid
+
+class Net(nn.Module):
+    """Sparse 3D network used as the benchmark workload.
+
+    The body is a SparseSequential of SubMConv3d pairs (kernel size 3, no
+    bias) separated by SparseMaxPool3d(2, 2) layers. Channel widths grow
+    3 -> 64 -> 96 -> 128 -> 160 -> 192 -> 224 -> 256. The two convolutions of
+    each pair use the same ``indice_key`` ("c0" .. "c6").
+
+    Args:
+        shape: spatial shape; used to size the pre-allocated grid and passed
+            to SparseConvTensor in ``forward``.
+        algo: algorithm selector forwarded to every conv and pooling layer.
+
+    Note: ``__init__`` calls ``.cuda()`` on the grid, so constructing this
+    module requires a CUDA device.
+    """
+    def __init__(self, shape, algo):
+        super().__init__()
+        # pooling layers use the same algo as the convolutions
+        pool_algo = algo
+        # pool_algo = ConvAlgo.Native
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
+                              algo=algo),
+
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(64, 64, 2, 2, bias=False, indice_key="m0"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(64,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            spconv.SubMConv3d(96,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            # nn.BatchNorm1d(64),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(96, 96, 2, 2, bias=False, indice_key="m1"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(96,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            spconv.SubMConv3d(128,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(128, 128, 2, 2, bias=False, indice_key="m2"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(128,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            spconv.SubMConv3d(160,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(160, 160, 2, 2, bias=False, indice_key="m3"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(160,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            spconv.SubMConv3d(192,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            # only the last two pooling layers are given an indice_key
+            # ("m4", "m5"); the commented-out inverse convs at the end of
+            # this list refer to those keys
+            spconv.SparseMaxPool3d(2, 2, indice_key="m4", algo=pool_algo),
+            # spconv.SparseConv3d(192, 192, 2, 2, bias=False, indice_key="m4"),
+            spconv.SubMConv3d(192,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            spconv.SubMConv3d(224,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            # nn.BatchNorm1d(224),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(224, 224, 2, 2, bias=False, indice_key="m5"),
+            spconv.SparseMaxPool3d(2, 2, indice_key="m5", algo=pool_algo),
+            spconv.SubMConv3d(224,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+
+            # nn.BatchNorm1d(256),
+            # nn.ReLU(),
+
+            # spconv.SparseInverseConv3d(256, 128, 2, indice_key="m5", bias=False, algo=algo),
+            # # # nn.BatchNorm1d(128),
+            # # # nn.ReLU(),
+
+            # spconv.SparseInverseConv3d(128, 64, 2, indice_key="m4", bias=False, algo=algo),
+        )
+        max_batch_size = 1
+        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
+        # int32 tensor of shape [max_batch_size, *shape], initialised to -1
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32).cuda()
+        # self.grid = None
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size, enable_timer: bool = False):
+        """Wrap the inputs in a SparseConvTensor and run it through ``self.net``.
+
+        ``self.shape`` and the pre-allocated ``self.grid`` are passed along
+        with ``features``, ``coors`` and ``batch_size``; ``enable_timer`` is
+        forwarded to SparseConvTensor unchanged.
+        """
+        x = spconv.SparseConvTensor(features,
+                                    coors,
+                                    self.shape,
+                                    batch_size,
+                                    self.grid,
+                                    enable_timer=enable_timer)
+        return self.net(x)
+
+# Maps the cumm dtypes supported by the benchmark to their torch equivalents;
+# bench_basic rejects any dtype that is not a key of this dict.
+_DTYPE_TO_TORCH_DTYPE = {
+    dtypes.float32: torch.float32,
+    dtypes.float16: torch.float16,
+}
+
+def bench_basic(dtype_str: str):
+    """Time forward and backward passes of ``Net`` for several conv algorithms.
+
+    For each algorithm in Native, MaskImplicitGemm and MaskSplitImplicitGemm
+    a fresh network is built on "cuda:0" and two lines are printed: the mean
+    forward time and the mean backward time (seconds, from ``time.time()``).
+
+    Args:
+        dtype_str: dtype shortcut understood by
+            ``dtypes.get_dtype_by_shortcut`` (the script entry point of this
+            file passes "f16").
+
+    Raises:
+        NotImplementedError: if the resolved dtype is not a key of
+            ``_DTYPE_TO_TORCH_DTYPE`` (float32 / float16).
+    """
+    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
+    if dtype not in _DTYPE_TO_TORCH_DTYPE:
+        raise NotImplementedError("only support bench f32 and f16 for now")
+    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
+    algos = [spconv.ConvAlgo.Native, spconv.ConvAlgo.MaskImplicitGemm, spconv.ConvAlgo.MaskSplitImplicitGemm]
+    (voxels, coors, spatial_shape) = get_voxel_data()
+    device = torch.device("cuda:0")
+
+    # params_grid yields tuples; the trailing comma unpacks the 1-tuple
+    for algo, in params_grid(algos):
+        voxels_th = torch.from_numpy(voxels).to(device).to(torch_dtype)
+        coors_th = torch.from_numpy(coors).to(device).int()
+        # gradients w.r.t. the input features are tracked for the backward bench
+        voxels_th.requires_grad = True
+        net = Net(spatial_shape, algo).to(device).train().to(torch_dtype)# .train()
+        spconv.assign_name_for_sparse_modules(net)
+        # one untimed forward pass; its result is only used below to get the
+        # shape of the output features
+        with torch.no_grad():
+            out: spconv.SparseConvTensor = net(voxels_th, coors_th, 1)
+        # random upstream gradient with the same shape as the output features
+        dout = np.random.uniform(-0.2, 0.2, out.features.shape).astype(np.float32)
+        dout_t = torch.from_numpy(dout).to(device).to(torch_dtype)
+        times = []
+        # forward benchmark: 20 timed iterations without autograd
+        with torch.no_grad():
+            for i in range(20):
+                # synchronize before and after so the wall-clock interval
+                # covers the GPU work of exactly this iteration
+                torch.cuda.synchronize()
+                t = time.time()
+                # 4th positional argument is enable_timer (False here)
+                out_nograd = net(voxels_th, coors_th, 1, False)
+                # NOTE(review): `timer` is never used afterwards
+                timer = out_nograd._timer
+                torch.cuda.synchronize()
+                times.append(time.time() - t)
+        # only the last 10 of the 20 iterations are averaged
+        print(f"basic[{dtype_str}|{algo}|forward]", np.mean(times[10:]))
+        times = []
+
+        # backward benchmark: the forward pass of each iteration runs before
+        # the timer starts, so only backward() is measured
+        for i in range(10):
+            out = net(voxels_th, coors_th, 1)
+            torch.cuda.synchronize()
+            t = time.time()
+            out.features.backward(dout_t)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+        # only the last 5 of the 10 iterations are averaged
+        print(f"basic[{dtype_str}|{algo}|backward]", np.mean(times[5:]))
+
+# When this file is executed as a script, run the benchmark with the "f16" dtype shortcut.
+if __name__ == "__main__":
+    bench_basic("f16")
\ No newline at end of file
--- a/spconv/benchmark/core.py
+++ b/spconv/benchmark/core.py
+import requests
+import fire 
+import pickle 
+from io import BytesIO
+import numpy as np
+from spconv.constants import PACKAGE_ROOT
+
+# Download URL used as a fallback by get_voxel_data() and get_pc_data().
+# NOTE(review): despite the "PC" in the name, this URL points at
+# test_spconv.pkl (the pickled voxel data), not at a point-cloud .npz file.
+RAW_PC_PATH = "https://raw.githubusercontent.com/traveller59/spconv/v2.1.10/test/data/test_spconv.pkl"
+
+def get_voxel_data():
+    """Load the pickled voxel test data as ``(voxels, coors, spatial_shape)``.
+
+    If ``test/data/test_spconv.pkl`` exists next to the package root, it is
+    unpickled and returned as is. Otherwise the file is downloaded from
+    ``RAW_PC_PATH`` into memory and unpickled from there.
+
+    Raises:
+        requests.HTTPError: via ``raise_for_status()`` when the download
+            returns an error status.
+    """
+    editable_test_data_path = PACKAGE_ROOT.parent / "test/data/test_spconv.pkl"
+    if editable_test_data_path.exists():
+        with editable_test_data_path.open("rb") as f:
+            return pickle.load(f)
+    # no local copy: stream the file into an in-memory buffer
+    ff = BytesIO()
+    with requests.get(RAW_PC_PATH, stream=True) as req:
+        req.raise_for_status()
+        for chunk in req.iter_content(chunk_size=8192): 
+            ff.write(chunk)
+    # rewind so pickle reads from the start of the buffer
+    ff.seek(0) 
+    # NOTE(review): this unpickles bytes fetched over the network; pickle can
+    # execute arbitrary code, so the URL must be trusted.
+    (voxels, coors, spatial_shape) = pickle.load(ff)
+    return voxels, coors, spatial_shape
+
+def get_pc_data():
+    """Load the benchmark point cloud (the ``"pc"`` array of an .npz archive).
+
+    If ``test/data/benchmark-pc.npz`` exists next to the package root, its
+    ``"pc"`` entry is returned. Otherwise data is downloaded from
+    ``RAW_PC_PATH`` into memory and read with ``np.load``.
+
+    NOTE(review): ``RAW_PC_PATH`` is the URL of ``test_spconv.pkl`` (a pickle),
+    yet the downloaded bytes are read here as an .npz archive with a ``"pc"``
+    key. The fallback branch looks copied from ``get_voxel_data`` -- confirm
+    the correct URL for ``benchmark-pc.npz``.
+
+    Raises:
+        requests.HTTPError: via ``raise_for_status()`` when the download
+            returns an error status.
+    """
+    editable_test_data_path = PACKAGE_ROOT.parent / "test/data/benchmark-pc.npz"
+    if editable_test_data_path.exists():
+        pc = np.load(str(editable_test_data_path))["pc"]
+        return pc 
+    # no local copy: stream the download into an in-memory buffer
+    ff = BytesIO()
+    with requests.get(RAW_PC_PATH, stream=True) as req:
+        req.raise_for_status()
+        for chunk in req.iter_content(chunk_size=8192): 
+            ff.write(chunk)
+    # rewind so np.load reads from the start of the buffer
+    ff.seek(0) 
+    pc = np.load(ff)["pc"]
+    return pc
+
+# Manual smoke test: load the point cloud and print its first 10 entries.
+if __name__ == "__main__":
+    pc = get_pc_data()
+    print(pc[:10])
\ No newline at end of file
--- a/spconv/core.py
+++ b/spconv/core.py
@@ -452,7 +452,7 @@ IMPLGEMM_VOLTA_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (64, 64, 32), (32, 32, 32),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f16,f16,f16,f16,f16"],
+                     2, ["f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -464,7 +464,7 @@ IMPLGEMM_VOLTA_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (64, 64, 32), (32, 32, 32),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f16,f16,f16,f16,f16"],
+                     2, ["f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -476,7 +476,7 @@ IMPLGEMM_VOLTA_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (128, 128, 32), (32, 64, 32),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f16,f16,f16,f16,f16"],
+                     2, ["f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,

--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
@@ -298,7 +298,7 @@ class SpconvOps:
        """
        ...
    @staticmethod
-    def point2voxel_cpu(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point2voxel_cpu(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, pc_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -306,6 +306,7 @@ class SpconvOps:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            pc_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -315,7 +316,7 @@ class SpconvOps:
        """
        ...
    @staticmethod
-    def point2voxel_cuda(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point2voxel_cuda(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, pc_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], empty_mean: bool = False, clear_voxels: bool = True, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -324,6 +325,7 @@ class SpconvOps:
            num_per_voxel: 
            hashdata: 
            point_indice_data: 
+            pc_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops1d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops1d.pyi
@@ -29,7 +29,7 @@ class Point2Voxel:
        """
        ...
    @staticmethod
-    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -38,6 +38,7 @@ class Point2Voxel:
            num_per_voxel: 
            hashdata: 
            point_indice_data: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops2d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops2d.pyi
@@ -29,7 +29,7 @@ class Point2Voxel:
        """
        ...
    @staticmethod
-    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -38,6 +38,7 @@ class Point2Voxel:
            num_per_voxel: 
            hashdata: 
            point_indice_data: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops3d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops3d.pyi
@@ -29,7 +29,7 @@ class Point2Voxel:
        """
        ...
    @staticmethod
-    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -38,6 +38,7 @@ class Point2Voxel:
            num_per_voxel: 
            hashdata: 
            point_indice_data: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops4d.pyi
@@ -29,7 +29,7 @@ class Point2Voxel:
        """
        ...
    @staticmethod
-    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_hash_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, hashdata: Tensor, point_indice_data: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True, empty_mean: bool = False, stream_int: int = 0) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -38,6 +38,7 @@ class Point2Voxel:
            num_per_voxel: 
            hashdata: 
            point_indice_data: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi
@@ -27,7 +27,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -35,6 +35,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -43,7 +44,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -51,6 +52,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi
@@ -27,7 +27,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -35,6 +35,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -43,7 +44,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -51,6 +52,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi
@@ -27,7 +27,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -35,6 +35,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -43,7 +44,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -51,6 +52,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
@@ -27,7 +27,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -35,6 +35,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 
@@ -43,7 +44,7 @@ class Point2VoxelCPU:
        """
        ...
    @staticmethod
-    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+    def point_to_voxel_empty_mean_static(points: Tensor, voxels: Tensor, indices: Tensor, num_per_voxel: Tensor, densehashdata: Tensor, points_voxel_id: Tensor, vsize: List[float], grid_size: List[int], grid_stride: List[int], coors_range: List[float], clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
        """
        Args:
            points: 
@@ -51,6 +52,7 @@ class Point2VoxelCPU:
            indices: 
            num_per_voxel: 
            densehashdata: 
+            points_voxel_id: 
            vsize: 
            grid_size: 
            grid_stride: 

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
@@ -920,7 +920,7 @@ class SpconvOps(pccm.Class):
    def point2voxel_cpu(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, densehashdata", "tv::Tensor")
+        code.arg("voxels, indices, num_per_voxel, densehashdata, pc_voxel_id", "tv::Tensor")
        code.arg("vsize", f"std::vector<float>")
        code.arg("grid_size, grid_stride", f"std::vector<int>")
        code.arg("coors_range", f"std::vector<float>")
@@ -950,11 +950,11 @@ class SpconvOps(pccm.Class):
                }}
                if (empty_mean){{
                    return Point2Voxel{ndim}DCPU::point_to_voxel_empty_mean_static(points, voxels, indices, 
-                        num_per_voxel, densehashdata, 
+                        num_per_voxel, densehashdata, pc_voxel_id,
                        vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels);
                }} else{{
                    return Point2Voxel{ndim}DCPU::point_to_voxel_static(points, voxels, indices, 
-                        num_per_voxel, densehashdata, 
+                        num_per_voxel, densehashdata, pc_voxel_id,
                        vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels);
                }}
            }}
@@ -967,7 +967,7 @@ class SpconvOps(pccm.Class):
    def point2voxel_cuda(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
+        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data, pc_voxel_id",
                 "tv::Tensor")
        code.arg("vsize", f"std::vector<float>")
        code.arg("grid_size, grid_stride", f"std::vector<int>")
@@ -1000,7 +1000,7 @@ class SpconvOps(pccm.Class):
                    coors_range_[i + {ndim}] = coors_range[i + {ndim}];
                }}
                return Point2Voxel{ndim}D::point_to_voxel_hash_static(points, voxels, indices, 
-                    num_per_voxel, hashdata, point_indice_data,
+                    num_per_voxel, hashdata, point_indice_data, pc_voxel_id,
                    vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels, 
                    empty_mean, stream_int);
            }}

--- a/spconv/csrc/sparse/pointops.py
+++ b/spconv/csrc/sparse/pointops.py
@@ -208,6 +208,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        code.arg("points_indice_data", f"const int64_t*")
        code.arg("voxels", f"{self.dtype} *")
        code.arg("num_per_voxel", f"int *")
+        code.arg("points_voxel_id", f"int64_t*")

        code.arg("point_stride", f"int")
        code.arg("max_points_per_voxel", f"int")
@@ -219,14 +220,17 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")

        code.arg("num_points", f"int")
+        # TODO add backward?
        code.raw(f"""
        int voxel_stride0 = point_stride * max_points_per_voxel;
        for (int i : tv::KernelLoopX<int>(num_points)){{
            int64_t prod = points_indice_data[i];
+            int voxel_id = -1;
            if (prod != -1){{
                auto voxel_index_pair = table.lookup(prod);
                if (!voxel_index_pair.empty() &&
                    voxel_index_pair.second < max_voxels) {{
+                    voxel_id = voxel_index_pair.second;
                    int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1);
                    if (old < max_points_per_voxel) {{
                        for (int j = 0; j < point_stride; ++j) {{
@@ -235,6 +239,7 @@ class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
                    }}
                }}
            }}
+            points_voxel_id[i] = voxel_id;
        }}
        """)
        return code
@@ -385,6 +390,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        code.arg("stream_int", f"std::uintptr_t", "0")

        code.raw(f"""
+        tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, 0);
        int64_t expected_hash_data_num = points.dim(0) * 2;
        if (hashdata.dim(0) < expected_hash_data_num){{
            hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
@@ -393,74 +399,18 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
            point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
        }}
        return point_to_voxel_hash_static(points, voxels, indices, num_per_voxel, 
-            hashdata, point_indice_data, Point2VoxelCommon::tvarray2array(vsize), 
+            hashdata, point_indice_data, points_voxel_id, Point2VoxelCommon::tvarray2array(vsize), 
            Point2VoxelCommon::tvarray2array(grid_size), Point2VoxelCommon::tvarray2array(grid_stride), 
            Point2VoxelCommon::tvarray2array(coors_range), clear_voxels, empty_mean, stream_int);
        """)
        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
-        code.raw(f"""
-
-        TV_ASSERT_INVALID_ARG(points.ndim() == 2 && points.dim(1) >= {self.ndim}, "error");
-        using V = int64_t;
-        using KeyType = int64_t;
-        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
-        if (clear_voxels){{
-            voxels.zero_();
-        }}
-        using table_t =
-            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
-                                        kEmptyKey, false>;
-        using pair_t = typename table_t::value_type;
-        // int64_t expected_hash_data_num = int64_t(tv::hash::align_to_power2(points.dim(0) * 2));
-        int64_t expected_hash_data_num = points.dim(0) * 2;
-
-        if (hashdata.dim(0) < expected_hash_data_num){{
-            hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
-        }}
-        if (point_indice_data.dim(0) < points.dim(0)){{
-            point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
-        }}
-        // auto timer = tv::CudaContextTimer<>();
-        num_per_voxel.zero_();
-        table_t hash = table_t(hashdata.data_ptr<pair_t>(), expected_hash_data_num);
-        hash.clear();
-        // tv::ssprint("clear time", timer.report());
-        auto launcher = tv::cuda::Launch(points.dim(0));
-        launcher(kernel::build_hash_table<table_t>, hash, points.data_ptr<const {self.dtype}>(),
-                point_indice_data.data_ptr<int64_t>(),
-                points.dim(1), vsize, coors_range, grid_size, grid_stride, points.dim(0));
-        // tv::ssprint("build_hash_table", timer.report());
-
-        auto table_launcher = tv::cuda::Launch(hash.size());
-        tv::Tensor count = tv::zeros({{1}}, tv::int32, 0);
-        Layout layout = Layout::from_shape(grid_size);
-        table_launcher(kernel::assign_table<table_t>, hash, indices.data_ptr<int>(),
-                        count.data_ptr<int>(),
-                        layout, voxels.dim(0));
-        auto count_cpu = count.cpu();
-        int count_val = count_cpu.item<int32_t>();
-        // tv::ssprint("assign_table", timer.report());
-
-        launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
-                point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
-                num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1), 
-                voxels.dim(0), vsize, coors_range,
-                grid_size, grid_stride, points.dim(0));
-        // tv::ssprint("generate_voxel", timer.report());
-
-        return std::make_tuple(voxels.slice_first_axis(0, count_val), 
-            indices.slice_first_axis(0, count_val), 
-            num_per_voxel.slice_first_axis(0, count_val));
-
-        """)
-        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def point_to_voxel_hash_static(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data",
+        code.arg("voxels, indices, num_per_voxel, hashdata, point_indice_data, points_voxel_id",
                 "tv::Tensor")
        code.arg("vsize", f"std::array<float, {self.ndim}>")
        code.arg("grid_size, grid_stride", f"std::array<int, {self.ndim}>")
@@ -516,7 +466,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):

        launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
                point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
-                num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1), 
+                num_per_voxel.data_ptr<int>(), points_voxel_id.data_ptr<int64_t>(), points.dim(1), voxels.dim(1), 
                voxels.dim(0), vsize_tv, coors_range_tv,
                grid_size_tv, grid_stride_tv, points.dim(0));
        // tv::ssprint("generate_voxel", timer.report());
@@ -636,7 +586,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    def point_to_voxel_static_template(self, mean: bool = False):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
-        code.arg("voxels, indices, num_per_voxel, densehashdata", "tv::Tensor")
+        code.arg("voxels, indices, num_per_voxel, densehashdata, points_voxel_id", "tv::Tensor")
        code.arg("vsize", f"std::array<float, {self.ndim}>")
        code.arg("grid_size, grid_stride", f"std::array<int, {self.ndim}>")
        code.arg("coors_range", f"std::array<float, {self.ndim * 2}>")
@@ -653,6 +603,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        if (clear_voxels){{
            voxels.zero_();
        }}
+        auto points_voxel_id_ptr = points_voxel_id.data_ptr<int64_t>();
        int res_voxel_num = 0;
        int num_features = points.dim(1);
        auto N = points.dim(0);
@@ -680,20 +631,25 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
                    }}
                    coor[j] = c;
                }}
-                if (failed)
+                if (failed){{
+                    points_voxel_id_ptr[i] = -1;
                    continue;
+                }}
                voxelidx = coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))});
                
                if (voxelidx == -1) {{
                    voxelidx = voxel_num;
-                    if (voxel_num >= max_num_voxels)
+                    if (voxel_num >= max_num_voxels){{
+                        points_voxel_id_ptr[i] = -1;
                        continue;
+                    }}
                    voxel_num += 1;
                    coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))}) = voxelidx;
                    for (int k = 0; k < {self.ndim}; ++k) {{
                        coors_rw(voxelidx, k) = coor[k];
                    }}
                }}
+                points_voxel_id_ptr[i] = voxelidx;
                num = num_points_per_voxel_rw(voxelidx);
                if (num < max_num_points_per_voxel) {{
                    // voxel_point_mask_rw(voxelidx, num) = {self.dtype}(1);
@@ -781,8 +737,10 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        code.arg("points", "tv::Tensor")
        code.arg("clear_voxels", "bool", "true")
        code.raw(f"""
+        tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, -1);
+
        return point_to_voxel_static(points, voxels, indices, num_per_voxel, densehashdata, 
-            tvarray2array(vsize), 
+            points_voxel_id, tvarray2array(vsize), 
            tvarray2array(grid_size), tvarray2array(grid_stride), 
            tvarray2array(coors_range), clear_voxels);
        """)
@@ -795,8 +753,10 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        code.arg("points", "tv::Tensor")
        code.arg("clear_voxels", "bool", "true")
        code.raw(f"""
+        tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, -1);
+
        return point_to_voxel_empty_mean_static(points, voxels, indices, num_per_voxel, 
-            densehashdata, tvarray2array(vsize), 
+            densehashdata, points_voxel_id, tvarray2array(vsize), 
            tvarray2array(grid_size), tvarray2array(grid_stride), 
            tvarray2array(coors_range), clear_voxels);
        """)

--- a/spconv/pytorch/constants.py
+++ b/spconv/pytorch/constants.py
@@ -27,3 +27,15 @@ try:
 except:
    # for unknown errors, just set a version
    PYTORCH_VERSION = [1, 8, 0]
+
+
+# True when PYTORCH_VERSION is at least [1, 6, 0]; is_amp_enabled() consults
+# this flag before calling torch.is_autocast_enabled().
+TORCH_HAS_AMP = PYTORCH_VERSION >= [1, 6, 0]
+
+def is_amp_enabled():
+    """Return whether torch autocast is currently enabled.
+
+    Returns ``False`` without calling into torch when ``TORCH_HAS_AMP`` is
+    false; otherwise returns ``torch.is_autocast_enabled()``.
+    """
+    # Short-circuit so the torch call is skipped when the flag is unset.
+    return TORCH_HAS_AMP and torch.is_autocast_enabled()
\ No newline at end of file