Merge branch 'develop'

bab09b63 · yan.yan · 7af751dc · 66529500 · bab09b63 · 7af751dc
Commit bab09b63 authored Jul 12, 2022 by yan.yan
20 changed files
--- a/docs/SPCONV_2_BREAKING_CHANGEs.md
+++ b/docs/SPCONV_2_BREAKING_CHANGEs.md
@@ -24,4 +24,4 @@
 * VoxelGenerator has been replaced by ```spconv.pytorch.utils.PointToVoxel``` (torch API) or Point2VoxelGPU[1-4]d/Point2VoxelCPU[1-4]d (tv.Tensor API).
 * spconv < 2.1 don't support CPU. spconv 2.1+ support cpu for debug usage.

-* test spconv 1.x model in spconv 2.x: Firstly set environment variable before run program, Then set all ```algo``` in conv/pool to ```ConvAlgo.Native```. Linux: ```export SPCONV_FILTER_HWIO="1"```, Windows powershell: ```$Env:SPCONV_FILTER_HWIO = "1"```. **WARNING** test spconv 1.x model don't support implicit gemm algorithm.
+* test spconv 1.x model in spconv 2.x: Linux: ```export SPCONV_SAVED_WEIGHT_LAYOUT="RSCK"```, Windows powershell: ```$Env:SPCONV_SAVED_WEIGHT_LAYOUT = "RSCK"```. 
\ No newline at end of file
--- a/scripts/dev_subm.py
+++ b/scripts/dev_subm.py
-import sys
-from pathlib import Path
-from typing import Dict, List, Tuple
-import pickle
-import sys
-import time
-from pathlib import Path
-from cumm.gemm.algospec.core import GemmAlgo
-
-import numpy as np
-import pccm
-import torch
-import torch.nn.functional as F
-
-from cumm import dtypes
-from cumm import tensorview as tv
-from cumm.constants import PACKAGE_ROOT
-from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
-from cumm.conv.main import ConvMainUnitTest, gen_gemm_kernels
-from cumm.conv.params import ConvProblem
-from cumm.gemm import kernel
-import os
-from spconv.core_cc.csrc.sparse.all import SpconvOps
-from cumm.gemm.codeops import div_up
-from spconv.constants import PACKAGE_ROOT
-from spconv.core import ConvAlgo
-
-from spconv.pytorch import ops
-from spconv.algo import CONV, BestConvAlgoByProfile
-from spconv.pytorch.cppcore import torch_tensor_to_tv
-
-
-def reduce_mask_count(mask: np.ndarray, width: int):
-    mask_length_32 = (div_up(mask.shape[0], width)) * width
-    if mask.shape[0] < mask_length_32:
-        mask_pad = np.zeros((mask_length_32, ), dtype=mask.dtype)
-        mask_pad[:mask.shape[0]] = mask
-        mask = mask_pad
-    mask = mask.reshape(-1, width)
-    maskr = np.bitwise_or.reduce(mask, axis=1)
-    maskr_tv = tv.from_numpy(maskr)
-    return SpconvOps.count_bits(maskr_tv).numpy().sum() * width
-
-
-def reduce_mask_count_x(mask: np.ndarray, width: int):
-    mask_length_32 = (div_up(mask.shape[0], width)) * width
-    if mask.shape[0] < mask_length_32:
-        mask_pad = np.zeros((mask_length_32, ), dtype=mask.dtype)
-        mask_pad[:mask.shape[0]] = mask
-        mask = mask_pad
-    mask = mask.reshape(-1, width)
-    maskr = np.bitwise_or.reduce(mask, axis=1)
-    return maskr
-
-
-def dev_subm_inds_v2(subm: bool = False, run_conv: bool = True):
-    limit_input_n = 16384
-    limit_input_n = None
-    np.random.seed(484)
-
-    with (PACKAGE_ROOT.parent / "test/data/test_spconv.pkl").open("rb") as f:
-        voxels_np, indices_np, spatial_shape = pickle.load(f)
-        from spconv.test_utils import generate_sparse_data
-        voxels_np = voxels_np[:limit_input_n]
-        indices_np = indices_np[:limit_input_n]
-
-        spatial_shape = [19, 18, 17]
-        sparse_dict = generate_sparse_data(spatial_shape, [1024], 128)
-
-        voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
-            np.float32)
-        indices_np = np.ascontiguousarray(
-            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
-
-        voxels = tv.from_numpy(voxels_np).cuda()
-        indices = tv.from_numpy(indices_np).cuda()
-        indices_th = torch.from_numpy(indices_np).cuda()
-    print(spatial_shape, indices_np.shape)
-    ndim = 3
-    if subm:
-        ksize = [3, 3, 3]
-        kv = np.prod(ksize)
-        padding = [1] * ndim
-        stride = [1] * ndim
-        dilation = [1] * ndim
-        out_padding = [0] * ndim
-    else:
-        ksize = [2, 2, 2]
-        kv = np.prod(ksize)
-        padding = [0] * ndim
-        stride = [1] * ndim
-        dilation = [1] * ndim
-        out_padding = [0] * ndim
-    out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
-        indices_th, 1, spatial_shape, ConvAlgo.Native, ksize, stride, padding,
-        dilation, out_padding, subm)
-    indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
-    indice_pairs_np = pair_ref.cpu().numpy()
-    algo = ConvAlgo.MaskSplitImplicitGemm
-    if algo == ConvAlgo.MaskImplicitGemm:
-        num_split = 1
-    else:
-        num_split = 2
-    for i in range(5):
-        res = ops.get_indice_pairs_implicit_gemm(indices_th, 1, spatial_shape,
-                                                 algo, ksize, stride, padding,
-                                                 dilation, out_padding, subm)
-    out_inds = res[0]
-    num_inds_per_loc = res[1]
-    pair_fwd = res[2]
-    pair_fwd_x = pair_fwd.cpu().numpy().reshape(-1)
-    pair_fwd_x[pair_fwd_x == -1] = 0
-    loc_num_np = (pair_fwd_x > 0).reshape(kv, -1).sum(1)
-    print(loc_num_np)
-    print(indice_num_per_loc_np)
-
-    pair_bwd = res[3]
-    pair_mask_fwd_splits = res[4]
-    pair_mask_bwd_splits = res[5]
-    mask_argsort_fwd_splits = res[6]
-    mask_argsort_bwd_splits = res[7]
-    masks = res[8]
-    pair_mask_fwd_splits_tv = [
-        ops.torch_tensor_to_tv(t, dtype=tv.uint32)
-        for t in pair_mask_fwd_splits
-    ]
-    valid_location_bitcount = [
-        SpconvOps.count_bits(t) for t in pair_mask_fwd_splits_tv
-    ]
-    valid_location_count = sum(
-        [t.cpu().numpy().sum() for t in valid_location_bitcount])
-    reduce_length = 32
-    split_mask_valid_count = sum([
-        reduce_mask_count(t.cpu().numpy(), reduce_length)
-        for t in pair_mask_fwd_splits_tv
-    ])
-    if subm:
-        print("SUBM", valid_location_count, split_mask_valid_count,
-              pair_fwd.numel())
-    else:
-        print("REGULAR", valid_location_count, split_mask_valid_count,
-              pair_fwd.numel())
-    # return
-
-    if run_conv:
-        C = 64
-        K = 64
-        desps = CONV.desps
-        mask_output_fwd = torch.zeros([2, div_up(out_inds.shape[0], 32)],
-                                      dtype=torch.int32,
-                                      device=indices_th.device)
-        mask_output_bwd = torch.zeros([2, div_up(indices.dim(0), 32)],
-                                      dtype=torch.int32,
-                                      device=indices_th.device)
-
-        for desp in desps:
-            if desp.algo != GemmAlgo.Simt.value:
-                continue
-            # if desp.op_type == ConvOpType.kBackwardWeight.value:
-            #     continue
-            # if desp.tile_shape !
-            if desp.dtype_a == dtypes.int8.tv_dtype:
-                inp = np.random.randint(-1, 1, size=[voxels_np.shape[0],
-                                                     C]).astype(np.int8)
-                weight = np.random.randint(-1, 1, size=[K, *ksize,
-                                                        C]).astype(np.int8)
-                output = np.random.randint(-1, 1, size=[
-                    out_inds.shape[0], K
-                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
-            else:
-                inp = np.random.uniform(-1, 1, size=[
-                    voxels_np.shape[0], C
-                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_input))
-                weight = np.random.uniform(-1, 1, size=[K, *ksize, C]).astype(
-                    dtypes.get_npdtype_from_tvdtype(desp.dtype_weight))
-                output = np.random.uniform(-1, 1, size=[
-                    out_inds.shape[0], K
-                ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
-            weight_ref = weight.transpose(1, 2, 3, 0, 4)
-            weight_ref = np.ascontiguousarray(weight_ref).reshape(-1, K, C)
-            if desp.op_type == ConvOpType.kBackwardInput.value:
-                inp_tv = tv.zeros(inp.shape, desp.dtype_input, 0)
-            else:
-                inp_tv = tv.from_numpy(inp).cuda()
-            if desp.op_type == ConvOpType.kBackwardWeight.value:
-                weight_tv = tv.zeros(weight.shape, desp.dtype_weight, 0)
-            else:
-                weight_tv = tv.from_numpy(weight).cuda()
-            # _ = tv.zeros([5000, 10], tv.float32, 0)
-            if desp.op_type == ConvOpType.kForward.value:
-                output_tv = tv.zeros(output.shape, desp.dtype_output, 0)
-            else:
-                output_tv = tv.from_numpy(output).cuda()
-            torch.cuda.synchronize()
-            t = time.time()
-            spk = 1
-            if desp.op_type == ConvOpType.kBackwardWeight.value:
-                # TODO support splitk parallel
-                spk = 32
-            if subm:
-                if desp.op_type == ConvOpType.kForward.value:
-                    indice_pairs = pair_fwd
-                elif desp.op_type == ConvOpType.kBackwardInput.value:
-                    indice_pairs = pair_bwd
-                else:
-                    indice_pairs = pair_fwd
-                mask_output = mask_output_fwd
-                # print([bin(x.item()) for x in masks])
-                for j in range(num_split):
-                    beta = 1 if j == 1 else 0
-                    mask_filter = 0xffffffff
-                    mask_filter = masks[j].item()
-
-                    reverse_mask = False
-                    if desp.op_type == ConvOpType.kBackwardWeight.value:
-                        mask_op = mask_output[j]
-                    else:
-                        mask_op = pair_mask_fwd_splits[j]
-                    if desp.op_type == ConvOpType.kBackwardInput.value:
-                        reverse_mask = True
-                    CONV.run_with_tuned_result(
-                        BestConvAlgoByProfile(desp, spk),
-                        desp.op_type,
-                        inp_tv,
-                        weight_tv,
-                        output_tv,
-                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
-                        torch_tensor_to_tv(mask_argsort_fwd_splits[j]),
-                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
-                        torch_tensor_to_tv(indice_pairs),
-                        reverse_mask,
-                        mask_filter=mask_filter,
-                        mask_width=32,
-                        beta=beta,
-                        verbose=True,
-                    )
-            else:
-                if desp.op_type == ConvOpType.kForward.value:
-                    indice_pairs = pair_fwd  # inp -> out
-                    mask_ops = pair_mask_fwd_splits
-                    mask_argsorts = mask_argsort_fwd_splits
-                    mask_output = mask_output_fwd
-                elif desp.op_type == ConvOpType.kBackwardInput.value:
-                    indice_pairs = pair_bwd  # out -> inp
-                    mask_ops = pair_mask_bwd_splits
-                    mask_argsorts = mask_argsort_bwd_splits
-                    mask_output = mask_output_bwd
-
-                    print([bin(x.item()) for x in masks])
-                else:
-                    indice_pairs = pair_fwd  # inp -> out
-                    mask_ops = pair_mask_fwd_splits
-                    mask_argsorts = mask_argsort_fwd_splits
-                    mask_output = mask_output_fwd
-
-                for j in range(2):
-                    beta = 1 if j == 1 else 0
-                    mask_filter = masks[j].item()
-                    reverse_mask = False
-                    if desp.op_type == ConvOpType.kBackwardWeight.value:
-                        mask_op = mask_output[j]
-                    else:
-                        mask_op = mask_ops[j]
-
-                    CONV.run_with_tuned_result(
-                        BestConvAlgoByProfile(desp, spk),
-                        desp.op_type,
-                        inp_tv,
-                        weight_tv,
-                        output_tv,
-                        torch_tensor_to_tv(mask_op, dtype=tv.uint32),
-                        torch_tensor_to_tv(mask_argsorts[j]),
-                        torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
-                        torch_tensor_to_tv(indice_pairs),
-                        reverse_mask,
-                        mask_filter=mask_filter,
-                        mask_width=32,
-                        beta=beta,
-                        verbose=True,
-                    )
-
-            torch.cuda.synchronize()
-            duration = time.time() - t
-            if desp.op_type == ConvOpType.kForward.value:
-                output_ref = np.zeros_like(output, dtype=np.float32)
-                # ref algorithm
-                for filter_offset in range(kv):
-                    if subm and filter_offset > kv // 2:
-                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
-                    elif subm and filter_offset == kv // 2:
-                        nhot = voxels.shape[0]
-                    else:
-                        nhot = indice_num_per_loc_np[filter_offset]
-                    a_inds = indice_pairs_np[0][filter_offset][:nhot]
-                    c_inds = indice_pairs_np[1][filter_offset][:nhot]
-                    # print(a_inds_cpu[:10])
-                    a = inp[a_inds]
-                    cc = a.astype(
-                        np.float32) @ weight_ref[filter_offset].T.astype(
-                            np.float32)
-                    output_ref[c_inds] += cc
-
-                output_cpu = output_tv.cpu().numpy().astype(np.float32)
-                duration = time.time() - t
-                my = output_cpu.reshape(-1)
-                print("ERROR", np.linalg.norm(output_ref.reshape(-1) - my))
-
-            elif desp.op_type == ConvOpType.kBackwardInput.value:
-                dinput_ref = np.zeros_like(inp, dtype=np.float32)
-                # ref algorithm
-                for filter_offset in range(kv):
-                    if subm and filter_offset > kv // 2:
-                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
-                    elif subm and filter_offset == kv // 2:
-                        nhot = voxels.shape[0]
-                    else:
-                        nhot = indice_num_per_loc_np[filter_offset]
-                    a_inds = indice_pairs_np[1][filter_offset][:nhot]
-                    c_inds = indice_pairs_np[0][filter_offset][:nhot]
-
-                    # print(a_inds_cpu[:10])
-                    a = output[a_inds]
-                    # NK @ KC
-                    cc = a.astype(
-                        np.float32) @ weight_ref[filter_offset].astype(
-                            np.float32)
-                    dinput_ref[c_inds] += cc
-                din_cpu = inp_tv.cpu().numpy()
-                print(
-                    "ERROR",
-                    np.linalg.norm(
-                        din_cpu.reshape(-1) - dinput_ref.reshape(-1)))
-            else:
-                dw_ref = np.zeros_like(weight_ref,
-                                       dtype=np.float32)  # KV, K, C
-                for filter_offset in range(kv):
-                    if subm and filter_offset > kv // 2:
-                        nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
-                    elif subm and filter_offset == kv // 2:
-                        nhot = voxels.shape[0]
-                    else:
-                        nhot = indice_num_per_loc_np[filter_offset]
-                    o_inds = indice_pairs_np[1][filter_offset][:nhot]
-                    i_inds = indice_pairs_np[0][filter_offset][:nhot]
-                    # print(a_inds_cpu[:10])
-                    out_gather = output[o_inds]  # [N, K]
-                    inp_gather = inp[i_inds]  # [N, C]
-                    # KN @ NC
-                    dw_res = out_gather.astype(
-                        np.float32).T @ inp_gather.astype(np.float32)
-                    dw_ref[filter_offset] = dw_res
-                # print(indice_pairs_np_test[0])
-                dw_ref_kcrs = dw_ref.transpose(1, 0, 2)
-                dw_cpu = weight_tv.cpu().numpy().reshape(K, np.prod(ksize), C)
-
-                print(
-                    "ERROR",
-                    np.linalg.norm(
-                        dw_cpu.reshape(-1) - dw_ref_kcrs.reshape(-1)))
-
-
-if __name__ == "__main__":
-    dev_subm_inds_v2()
--- a/spconv/__init__.py
+++ b/spconv/__init__.py
@@ -17,3 +17,5 @@ from . import build as _build
 from .core import ConvAlgo, AlgoHint
 from . import constants
 from .__version__ import __version__
+
+SPCONV_VERSION_NUMBERS = list(map(int, __version__.split(".")))
\ No newline at end of file
--- a/spconv/algo.py
+++ b/spconv/algo.py
@@ -228,9 +228,9 @@ class SimpleGemm:
                # skip volta tensor op since it is very slow in architectures except volta.
                if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
                    continue
-                lda = a.dim(1)
-                ldb = b.dim(1)
-                ldc = c.dim(1)
+                lda = a.stride[0]
+                ldb = b.stride[0]
+                ldc = c.stride[0]
                if desp.supported_ldx(lda, ldb, ldc):
                    if arch not in COMPILED_CUDA_ARCHS:
                        desp = desp.copy()
@@ -422,15 +422,16 @@ class SimpleGemm:
                                               c_inds.shape)
        avail = self.get_all_available(a, b, c, trans_a, trans_b, trans_c,
                                       arch, shuffle_type)
-
-        c_ = c.clone()
+        # c may be weight, may non-contiguous.
+        # cumm.tensorview.Tensor don't support non-contiguous clone
+        c_ = c.clone_whole_storage()
        times: List[float] = []
        best_gather_params = (-1, -1, -1, -1)
        best_scatter_params = (-1, -1, -1, -1)

        all_profile_res: List[BestAlgoByProfile] = []
        for desp in avail:
-            c_.zero_()
+            c_.zero_whole_storage_()
            split_k_slices = 1
            # TODO better splitk selection
            if desp.split_k_serial and hint & AlgoHint.BackwardWeight.value:

--- a/spconv/benchmark/__init__.py
+++ b/spconv/benchmark/__init__.py
--- a/spconv/benchmark/__main__.py
+++ b/spconv/benchmark/__main__.py
+from .basic import bench_basic
+
+import fire
+
+def bench_me_basic(dtype_str: str):
+    from spconv.benchmark.me import bench_me_basic
+    return bench_me_basic(dtype_str)
+
+def bench_torchsparse_basic(dtype_str: str):
+    from spconv.benchmark.thsp import bench_torchsparse_basic
+    return bench_torchsparse_basic(dtype_str)
+
+if __name__ == "__main__":
+    fire.Fire()
--- a/spconv/benchmark/basic.py
+++ b/spconv/benchmark/basic.py
+from spconv.benchmark.core import get_voxel_data
+
+
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+from torch import nn
+from cumm import tensorview as tv
+from spconv.core import ConvAlgo
+from cumm import dtypes
+import spconv.pytorch as spconv
+from spconv.test_utils import params_grid
+
+class Net(nn.Module):
+    def __init__(self, shape, algo):
+        super().__init__()
+        pool_algo = algo
+        # pool_algo = ConvAlgo.Native
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
+                              algo=algo),
+
+            spconv.SubMConv3d(64,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(64, 64, 2, 2, bias=False, indice_key="m0"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(64,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            spconv.SubMConv3d(96,
+                              96,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            # nn.BatchNorm1d(64),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(96, 96, 2, 2, bias=False, indice_key="m1"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(96,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            spconv.SubMConv3d(128,
+                              128,
+                              3,
+                              bias=False,
+                              indice_key="c2",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(128, 128, 2, 2, bias=False, indice_key="m2"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(128,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            spconv.SubMConv3d(160,
+                              160,
+                              3,
+                              bias=False,
+                              indice_key="c3",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(160, 160, 2, 2, bias=False, indice_key="m3"),
+            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
+            spconv.SubMConv3d(160,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            spconv.SubMConv3d(192,
+                              192,
+                              3,
+                              bias=False,
+                              indice_key="c4",
+                              algo=algo),
+            # nn.BatchNorm1d(128),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2, indice_key="m4", algo=pool_algo),
+            # spconv.SparseConv3d(192, 192, 2, 2, bias=False, indice_key="m4"),
+            spconv.SubMConv3d(192,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            spconv.SubMConv3d(224,
+                              224,
+                              3,
+                              bias=False,
+                              indice_key="c5",
+                              algo=algo),
+            # nn.BatchNorm1d(224),
+            # nn.ReLU(),
+            # spconv.SparseConv3d(224, 224, 2, 2, bias=False, indice_key="m5"),
+            spconv.SparseMaxPool3d(2, 2, indice_key="m5", algo=pool_algo),
+            spconv.SubMConv3d(224,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c6",
+                              algo=algo),
+
+            # nn.BatchNorm1d(256),
+            # nn.ReLU(),
+
+            # spconv.SparseInverseConv3d(256, 128, 2, indice_key="m5", bias=False, algo=algo),
+            # # # nn.BatchNorm1d(128),
+            # # # nn.ReLU(),
+
+            # spconv.SparseInverseConv3d(128, 64, 2, indice_key="m4", bias=False, algo=algo),
+        )
+        max_batch_size = 1
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size, enable_timer: bool = False):
+        x = spconv.SparseConvTensor(features,
+                                    coors,
+                                    self.shape,
+                                    batch_size,
+                                    enable_timer=enable_timer)
+        return self.net(x)
+
+_DTYPE_TO_TORCH_DTYPE = {
+    dtypes.float32: torch.float32,
+    dtypes.float16: torch.float16,
+}
+
+def bench_basic(dtype_str: str):
+    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
+    if dtype not in _DTYPE_TO_TORCH_DTYPE:
+        raise NotImplementedError("only support bench f32 and f16 for now")
+    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
+    algos = [spconv.ConvAlgo.Native, spconv.ConvAlgo.MaskImplicitGemm, spconv.ConvAlgo.MaskSplitImplicitGemm]
+    (voxels, coors, spatial_shape) = get_voxel_data()
+    device = torch.device("cuda:0")
+
+    for algo, in params_grid(algos):
+        voxels_th = torch.from_numpy(voxels).to(device).to(torch_dtype)
+        coors_th = torch.from_numpy(coors).to(device).int()
+        voxels_th.requires_grad = True
+        net = Net(spatial_shape, algo).to(device).train().to(torch_dtype)# .train()
+        spconv.assign_name_for_sparse_modules(net)
+        with torch.no_grad():
+            out: spconv.SparseConvTensor = net(voxels_th, coors_th, 1)
+        dout = np.random.uniform(-0.2, 0.2, out.features.shape).astype(np.float32)
+        dout_t = torch.from_numpy(dout).to(device).to(torch_dtype)
+        times = []
+        with torch.no_grad():
+            for i in range(100):
+                torch.cuda.synchronize()
+                t = time.time()
+                out_nograd = net(voxels_th, coors_th, 1, False)
+                timer = out_nograd._timer
+                torch.cuda.synchronize()
+                times.append(time.time() - t)
+        print(f"basic[{dtype_str}|{algo}|forward]", np.mean(times[50:]))
+        times = []
+
+        for i in range(50):
+            out = net(voxels_th, coors_th, 1)
+            torch.cuda.synchronize()
+            t = time.time()
+            out.features.backward(dout_t)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+        print(f"basic[{dtype_str}|{algo}|backward]", np.mean(times[25:]))
+
+if __name__ == "__main__":
+    bench_basic("f16")
\ No newline at end of file
--- a/spconv/benchmark/core.py
+++ b/spconv/benchmark/core.py
+import requests
+import fire 
+import pickle 
+from io import BytesIO
+import numpy as np
+from spconv.constants import PACKAGE_ROOT
+
+RAW_TEST_DATA_PATH = "https://raw.githubusercontent.com/traveller59/spconv/v2.1.10/test/data/test_spconv.pkl"
+RAW_PC_PATH = "https://raw.githubusercontent.com/traveller59/spconv/v2.1.10/test/data/benchmark-pc.npz"
+
+def get_voxel_data():
+    editable_test_data_path = PACKAGE_ROOT.parent / "test/data/test_spconv.pkl"
+    if editable_test_data_path.exists():
+        with editable_test_data_path.open("rb") as f:
+            return pickle.load(f)
+    ff = BytesIO()
+    with requests.get(RAW_TEST_DATA_PATH, stream=True) as req:
+        req.raise_for_status()
+        for chunk in req.iter_content(chunk_size=8192): 
+            ff.write(chunk)
+    ff.seek(0) 
+    (voxels, coors, spatial_shape) = pickle.load(ff)
+    return voxels, coors, spatial_shape
+
+def get_pc_data():
+    editable_test_data_path = PACKAGE_ROOT.parent / "test/data/benchmark-pc.npz"
+    if editable_test_data_path.exists():
+        pc = np.load(str(editable_test_data_path))["pc"]
+        return pc 
+    ff = BytesIO()
+    with requests.get(RAW_PC_PATH, stream=True) as req:
+        req.raise_for_status()
+        for chunk in req.iter_content(chunk_size=8192): 
+            ff.write(chunk)
+    ff.seek(0) 
+    pc = np.load(ff)["pc"]
+    return pc
+
+if __name__ == "__main__":
+    pc = get_pc_data()
+    print(pc[:10])
\ No newline at end of file
--- a/spconv/benchmark/me.py
+++ b/spconv/benchmark/me.py
+"""Benchmark MinkowskiEngine
+"""
+from spconv.benchmark.core import get_voxel_data
+
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+from torch import nn
+from spconv.core import ConvAlgo
+from cumm import dtypes
+from spconv.test_utils import params_grid
+
+_DTYPE_TO_TORCH_DTYPE = {
+    dtypes.float32: torch.float32,
+    dtypes.float16: torch.float16,
+}
+
+def bench_me_basic(dtype_str: str):
+    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
+    if dtype not in _DTYPE_TO_TORCH_DTYPE:
+        raise NotImplementedError("only support bench f32 and f16 for now")
+    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
--- a/spconv/benchmark/thsp.py
+++ b/spconv/benchmark/thsp.py
+"""Benchmark torchsparse
+"""
+from spconv.benchmark.core import get_voxel_data
+
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+from torch import nn
+from spconv.core import ConvAlgo
+from cumm import dtypes
+from spconv.test_utils import params_grid
+
+_DTYPE_TO_TORCH_DTYPE = {
+    dtypes.float32: torch.float32,
+    dtypes.float16: torch.float16,
+}
+
+def bench_torchsparse_basic(dtype_str: str):
+    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
+    if dtype not in _DTYPE_TO_TORCH_DTYPE:
+        raise NotImplementedError("only support bench f32 and f16 for now")
+    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
--- a/spconv/constants.py
+++ b/spconv/constants.py
@@ -25,10 +25,20 @@ PACKAGE_ROOT = Path(__file__).parent.resolve()
 EDITABLE_INSTALLED = project_is_installed(
    PACKAGE_NAME) and project_is_editable(PACKAGE_NAME)

-_filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", "0")
-FILTER_HWIO = _filter_hwio_env == "1"
+_filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", None)
+if _filter_hwio_env is not None:
+    raise NotImplementedError("SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead.")
+
 DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1"
 NDIM_DONT_CARE = 3
+FILTER_HWIO = False
+
+SAVED_WEIGHT_LAYOUT = os.getenv("SPCONV_SAVED_WEIGHT_LAYOUT", "")
+
+if SAVED_WEIGHT_LAYOUT != "":
+    assert SAVED_WEIGHT_LAYOUT in ["KRSC", "RSKC", "RSCK"], "please set SAVED_WEIGHT_LAYOUT to KRSC, RSKC or RSCK"
+
+ALL_WEIGHT_IS_KRSC = True

 SPCONV_DEBUG_SAVE_PATH = os.getenv("SPCONV_DEBUG_SAVE_PATH", "")

@@ -63,3 +73,4 @@ class SpconvAllocatorKeys:
    MaskArgSortBwd = "MaskArgSortBwd"

    OutFeatures = "OutFeatures"
+SPCONV_DEBUG_WEIGHT = False
--- a/spconv/core.py
+++ b/spconv/core.py
@@ -83,6 +83,9 @@ SHUFFLE_SIMT_PARAMS: List[GemmAlgoParams] = [
                        "f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
    *gen_shuffle_params((32, 32, 32), (32, 32, 8), ["f32,f32,f32,f32,f32"],
                        "f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params((16, 32, 8), (16, 16, 8), ["f32,f32,f32,f32,f32"],
+                        "f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
+
    # fall back kernels if mat is misaligned for half
    # TODO use access-per-vector kernel instead of simt kernel for fallback
    *gen_shuffle_params((128, 128, 8), (32, 64, 8), ["f16,f16,f16,f32,f32"],

--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
@@ -5,6 +5,20 @@ class ThrustCustomAllocatorV2:
    alloc_func: Callable[int, int]
 class SpconvOps:
    @staticmethod
+    def cumm_version() -> str: 
+        """
+        get cumm version when build spconv.
+                
+        """
+        ...
+    @staticmethod
+    def pccm_version() -> str: 
+        """
+        get pccm version when build spconv.
+                
+        """
+        ...
+    @staticmethod
    def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> None: 
        """
        Args:
@@ -300,6 +314,13 @@ class SpconvOps:
        """
        ...
    @staticmethod
+    def reverse_bits(a: Tensor) -> Tensor: 
+        """
+        Args:
+            a: 
+        """
+        ...
+    @staticmethod
    def calc_point2voxel_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
        """
        Args:

--- a/spconv/cppconstants.py
+++ b/spconv/cppconstants.py
@@ -19,6 +19,10 @@ if hasattr(_ext, "cumm"):
 else:
    CPU_ONLY_BUILD = True

+from spconv.core_cc.csrc.sparse.all import SpconvOps
+
+BUILD_CUMM_VERSION = SpconvOps.cumm_version()
+BUILD_PCCM_VERSION = SpconvOps.pccm_version()
 from spconv.core_cc.csrc.utils.boxops import BoxOps
 from spconv.core_cc.cumm.common import CompileInfo
 HAS_BOOST = BoxOps.has_boost()

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
@@ -13,11 +13,13 @@
 # limitations under the License.

 from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib, GemmBasicHost
+import cumm
 from cumm.conv.bases import ConvOpType, NHWC
 from cumm.conv.params import ConvProblem
 from cumm import dtypes
 from cumm.constants import CUMM_CPU_ONLY_BUILD
 import pccm
+from pccm.__version__ import __version__ as pccm_version
 from ccimport import compat
 from .pointops import Point2Voxel, Point2VoxelCPU
 from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndicesCPU
@@ -99,6 +101,28 @@ class SpconvOps(pccm.Class):
                self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d",
                                               indices,
                                               f"SpconvIndices{ndim}D")
+    
+    @pccm.pybind.mark
+    @pccm.static_function
+    def cumm_version(self):
+        """get cumm version when build spconv.
+        """
+        code = pccm.FunctionCode()
+        code.raw(f"""
+        return \"{cumm.__version__}\";
+        """)
+        return code.ret("std::string")
+
+    @pccm.pybind.mark
+    @pccm.static_function
+    def pccm_version(self):
+        """get pccm version when build spconv.
+        """
+        code = pccm.FunctionCode()
+        code.raw(f"""
+        return \"{pccm_version}\";
+        """)
+        return code.ret("std::string")

    @pccm.pybind.mark
    @pccm.cuda.static_function
@@ -853,6 +877,66 @@ class SpconvOps(pccm.Class):
        """)
        return code.ret("tv::Tensor")

+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def reverse_bits(self):
+        code = pccm.FunctionCode()
+        if CUMM_CPU_ONLY_BUILD:
+            return code.make_invalid()
+
+        code.add_dependency(TensorViewKernel)
+        code.arg("a", "tv::Tensor")
+        code.code_after_include = f"""
+        __global__ void reverse_bits_kernel_64(const uint64_t* data, uint64_t* out, int size){{
+            for (int i : tv::KernelLoopX<int>(size)){{
+                out[i] = __brevll(reinterpret_cast<const unsigned long long*>(data)[i]);
+            }}
+        }}
+
+        __global__ void reverse_bits_kernel(const uint32_t* data, uint32_t* out, int size){{
+            for (int i : tv::KernelLoopX<int>(size)){{
+                out[i] = __brev(data[i]);
+            }}
+        }}
+
+        uint32_t reverse(uint32_t x)
+        {{
+            x = ((x >> 1) & 0x55555555u) | ((x & 0x55555555u) << 1);
+            x = ((x >> 2) & 0x33333333u) | ((x & 0x33333333u) << 2);
+            x = ((x >> 4) & 0x0f0f0f0fu) | ((x & 0x0f0f0f0fu) << 4);
+            x = ((x >> 8) & 0x00ff00ffu) | ((x & 0x00ff00ffu) << 8);
+            x = ((x >> 16) & 0xffffu) | ((x & 0xffffu) << 16);
+            return x;
+        }}
+
+        int reverse(uint64_t i)
+        {{
+            return (reverse(uint32_t(i)) << 32) | reverse(uint32_t(i >> 32));
+        }}
+        """
+        code.raw(f"""
+        tv::Tensor res(a.shape(), a.dtype(), a.device());
+        tv::dispatch<uint32_t, uint64_t>(a.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            auto res_ptr = res.data_ptr<T>();
+            auto a_ptr = a.data_ptr<const T>();
+            if (a.device() == -1){{
+                for (int i = 0; i < a.size(); ++i){{
+                    res_ptr[i] = reverse(a_ptr[i]);
+                }}
+            }}else{{
+                tv::cuda::Launch launcher(a.size());
+                tv::if_constexpr<std::is_same<T, uint64_t>::value>([=](auto _)mutable{{
+                    launcher(_(reverse_bits_kernel_64), a_ptr, res_ptr, int(a.size()));
+                }}, [=](auto _)mutable{{
+                    launcher(_(reverse_bits_kernel), a_ptr, res_ptr, int(a.size()));
+                }});
+            }}
+        }});
+        return res;
+        """)
+        return code.ret("tv::Tensor")
+
    @pccm.pybind.mark
    @pccm.static_function
    def calc_point2voxel_meta_data(self):

--- a/spconv/csrc/sparse/cpu_core.py
+++ b/spconv/csrc/sparse/cpu_core.py
@@ -16,7 +16,6 @@ import pccm
 from ccimport import compat
 from cumm.common import TensorView

-
 class OMPLib(pccm.Class):
    def __init__(self):
        super().__init__()

--- a/spconv/csrc/sparse/gather.py
+++ b/spconv/csrc/sparse/gather.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 import pccm
-from cumm.common import TensorView
+from cumm.common import TensorView, GemmDTypes
 from cumm.constants import CUMM_CPU_ONLY_BUILD
 from spconv.csrc.sparse.cpu_core import OMPLib
 from typing import List
@@ -24,7 +24,7 @@ class GatherCPU(pccm.Class):
        super().__init__()
        if CUMM_CPU_ONLY_BUILD:
            self.add_dependency(OMPLib)
-        self.add_dependency(TensorView)
+        self.add_dependency(TensorView, GemmDTypes)
        self.add_include("tensorview/parallel/all.h")

    @pccm.static_function
@@ -39,7 +39,7 @@ class GatherCPU(pccm.Class):

        auto nhot = inds.dim(0);
        int channel = in.dim(1);
-        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
+        tv::dispatch<float, double, tv::bfloat16_t, tv::half_t>(out.dtype(), [&](auto I){{
            auto indices_data = inds.data_ptr<const int>();
            using T = TV_DECLTYPE(I);
            T *buffer_data = out.data_ptr<T>();
@@ -65,7 +65,7 @@ class GatherCPU(pccm.Class):
        // tv::check_shape(inds, {{in.dim(0)}});
        auto nhot = inds.dim(0);
        int channel = in.dim(1);
-        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
+        tv::dispatch<float, double, tv::bfloat16_t, tv::half_t>(out.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            auto indices_data = inds.data_ptr<const int>();
            const T *buffer_data = in.data_ptr<const T>();

--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
@@ -17,7 +17,7 @@ from cumm.gemm.core.metaarray import MetaArray, seq
 from cumm import dtypes
 import pccm
 from cumm.gemm.layout import TensorGeneric, to_stride
-from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
+from cumm.common import TensorView, GemmDTypes, TensorViewKernel, ThrustLib, GemmBasic
 from cumm.gemm import codeops
 from typing import List
 from cumm.conv.params import ConvProblem
@@ -352,7 +352,7 @@ class IndiceMaxPool(pccm.Class):
 class IndiceMaxPoolCPU(pccm.Class):
    def __init__(self):
        super().__init__()
-        self.add_dependency(TensorView)
+        self.add_dependency(TensorView, GemmDTypes)
        if CUMM_CPU_ONLY_BUILD:
            self.add_dependency(OMPLib)
        self.add_include("tensorview/parallel/all.h")
@@ -369,7 +369,7 @@ class IndiceMaxPoolCPU(pccm.Class):
        code.raw(f"""
        int nhot = out_inds.dim(0);
        int num_features = in.dim(1);
-        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            auto out_features = out.data_ptr<T>();
            auto in_features = in.data_ptr<const T>();
@@ -409,7 +409,7 @@ class IndiceMaxPoolCPU(pccm.Class):
        code.raw(f"""
        int nhot = out_inds.dim(0);
        int num_features = in.dim(1);
-        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            auto out_features = out.data_ptr<const T>();
            auto in_features = in.data_ptr<const T>();

--- a/spconv/pytorch/constants.py
+++ b/spconv/pytorch/constants.py
@@ -27,3 +27,15 @@ try:
 except:
    # for unknown errors, just set a version
    PYTORCH_VERSION = [1, 8, 0]
+
+
+if PYTORCH_VERSION >= [1, 6, 0]:
+    TORCH_HAS_AMP = True
+else:
+    TORCH_HAS_AMP = False
+
+def is_amp_enabled():
+    if TORCH_HAS_AMP:
+        return torch.is_autocast_enabled()
+    else:
+        return False 
\ No newline at end of file
--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -24,6 +24,7 @@ from torch.nn import init
 from torch.nn.parameter import Parameter

 from spconv import pytorch as spconv
+from spconv import SPCONV_VERSION_NUMBERS
 from spconv.core import ConvAlgo
 from spconv.debug_utils import spconv_save_debug_data
 from spconv.pytorch import functional as Fsp
@@ -31,10 +32,25 @@ from spconv.pytorch import ops
 from spconv.cppconstants import CPU_ONLY_BUILD
 from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndiceData, expand_nd
 from spconv.pytorch.modules import SparseModule
-from spconv.constants import FILTER_HWIO
+from spconv.constants import SAVED_WEIGHT_LAYOUT, ALL_WEIGHT_IS_KRSC, SPCONV_DEBUG_WEIGHT
 from spconv.utils import nullcontext
 from torch.nn.init import calculate_gain

+FILTER_HWIO = False
+
+
+def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[int]:
+    if isinstance(val, int):
+        val = [val] * ndim
+    elif isinstance(val, list):
+        assert len(val) == ndim
+    elif isinstance(val, tuple):
+        assert len(val) == ndim
+        return [*val]
+    else:
+        raise NotImplementedError
+    return val
+

 class SparseConvolution(SparseModule):
    __constants__ = [
@@ -101,7 +117,7 @@ class SparseConvolution(SparseModule):
        self.algo = algo
        self.fp32_accum = fp32_accum
        # self.algo = ConvAlgo.Native
-        if self.algo == ConvAlgo.Native:
+        if self.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
            if FILTER_HWIO:
                # RSCK
                self.weight = Parameter(
@@ -121,6 +137,37 @@ class SparseConvolution(SparseModule):
            self.register_parameter('bias', None)
        self.reset_parameters()

+        self._register_load_state_dict_pre_hook(self._load_weight_different_layout)
+
+    def _load_weight_different_layout(
+            self, state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs):
+        if not SAVED_WEIGHT_LAYOUT:
+            return
+        key = prefix + "weight"
+        assert key in state_dict
+        ndim = self.ndim
+        if SAVED_WEIGHT_LAYOUT == "RSKC":
+            state_dict[key] = state_dict[key].permute(ndim, *range(ndim), ndim + 1).contiguous()
+        elif SAVED_WEIGHT_LAYOUT == "RSCK":
+            state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim), ndim).contiguous()
+
+        if ALL_WEIGHT_IS_KRSC or self.algo != ConvAlgo.Native:
+            # in spconv 2.2, we only support KRSC layout.
+            if SAVED_WEIGHT_LAYOUT == "RSKC":
+                state_dict[key] = state_dict[key].permute(ndim, *range(ndim), ndim + 1).contiguous()
+            elif SAVED_WEIGHT_LAYOUT == "RSCK":
+                state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim), ndim).contiguous()
+
+        else:
+            if self.algo == ConvAlgo.Native:
+                # to RSCK
+                if SAVED_WEIGHT_LAYOUT == "RSKC":
+                    state_dict[key] = state_dict[key].permute(*range(ndim), ndim + 1, ndim).contiguous()
+                elif SAVED_WEIGHT_LAYOUT == "KRSC":
+                    state_dict[key] = state_dict[key].permute(*range(1, ndim + 1), 0, ndim + 1).contiguous()
+
+
    def extra_repr(self):
        s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
@@ -175,7 +222,11 @@ class SparseConvolution(SparseModule):
            return tensor.uniform_(-bound, bound)

    def reset_parameters(self):
-        self._custom_kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if SPCONV_DEBUG_WEIGHT:
+            self._custom_kaiming_uniform_(self.weight, a=math.sqrt(0.005))
+        else:
+            self._custom_kaiming_uniform_(self.weight, a=math.sqrt(5))
+
        if self.bias is not None:
            fan_in, _ = self._calculate_fan_in_and_fan_out()
            bound = 1 / math.sqrt(fan_in)