Merge branch 'develop'

a6abf55d · yan.yan · fad30002 · 79a3eaf2 · a6abf55d · a6abf55d
Commit a6abf55d authored Oct 20, 2021 by yan.yan
20 changed files
--- a/spconv/csrc/sparse/__init__.py
+++ b/spconv/csrc/sparse/__init__.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cumm.common import TensorViewKernel, ThrustLib
+from cumm.conv.bases import ConvOpType, NHWC
+from cumm.conv.params import ConvProblem
+from cumm import dtypes
+import pccm 
+from ccimport import compat
+from .pointops import Point2Voxel, Point2VoxelCPU
+from .indices import SparseConvIndicesKernel, CudaCommonKernel
+from .maxpool import IndiceMaxPool
+
+class SpconvOps(pccm.Class):
+    def __init__(self):
+        # For every supported dimensionality, register the GPU and CPU
+        # voxelizers and the sparse-conv index kernel under per-ndim aliases.
+        super().__init__()
+        self.ndims = [1, 2, 3, 4]
+        for dim in self.ndims:
+            voxelizer = Point2Voxel(dtypes.float32, dim)
+            voxelizer_cpu = Point2VoxelCPU(dtypes.float32, dim)
+            self.add_param_class(f"ops{dim}d", voxelizer,
+                                 f"Point2Voxel{dim}D")
+            self.add_param_class(f"ops_cpu{dim}d", voxelizer_cpu,
+                                 f"Point2Voxel{dim}DCPU")
+
+            conv_problem = ConvProblem(dim, ConvOpType.kForward, NHWC, NHWC,
+                                       NHWC)
+            inds_kernel = SparseConvIndicesKernel(conv_problem, dtypes.int32)
+            # Members that reference SpconvIndices{dim}D in their generated
+            # bodies; the list is rebuilt on every iteration, as before.
+            impl_users = [
+                self.generate_subm_conv_inds,
+                self.generate_conv_inds_stage1,
+                self.generate_conv_inds_stage1_5,
+                self.generate_conv_inds_stage2,
+                self.sort_1d_by_key,
+            ]
+            self.add_impl_only_param_class(impl_users, f"ops{dim}d",
+                                           inds_kernel,
+                                           f"SpconvIndices{dim}D")
+
+
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage1(self):
+        # Emits the runtime dispatcher for stage 1 of index generation: it
+        # derives ndim from `indices`, validates the per-axis parameter
+        # vectors, copies them into fixed-size tv::array values and forwards
+        # to the matching SpconvIndices{ndim}D implementation.
+        code = pccm.FunctionCode()
+        code.arg("indices", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"std::vector<int>")
+        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.arg("transposed", f"bool", "false")
+
+        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
+        # ndim is the second extent of `indices` minus one; every per-axis
+        # vector must have exactly that many entries.
+        code.raw(f"""
+        int ndim = indices.dim(1) - 1;
+        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
+            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
+            padding.size() == ndim, "your params size not equal to ndim", ndim);
+        """)
+
+        # One `if (ndim == K)` branch per supported dimensionality.
+        for ndim in self.ndims:
+            code.raw(f"""
+            if (ndim == {ndim}){{
+                tv::array<int, {ndim}> output_dims_, input_dims_;
+                tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
+                for (int i = 0; i < {ndim}; ++i){{
+                    output_dims_[i] = output_dims[i];
+                    input_dims_[i] = input_dims[i];
+                    ksize_[i] = ksize[i];
+                    stride_[i] = stride[i];
+                    padding_[i] = padding[i];
+                    dilation_[i] = dilation[i];
+                }}
+                return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
+                    indice_pairs, indice_pairs_uniq, indice_num_per_loc,
+                    batch_size, output_dims_, input_dims_, 
+                    ksize_, stride_, padding_, dilation_, transposed, stream_int);
+            }}
+            """)
+        # Reached only when no branch above matched.
+        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+
+        # NOTE(review): no return type is declared here, unlike the sibling
+        # dispatchers which use .ret("int"), although the generated body
+        # uses `return <call>` - confirm the intended return type.
+        return code# .ret("int")
+
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage1_5(self):
+        # Emits the stage-1.5 dispatcher: the caller passes ndim explicitly
+        # and the call is forwarded to SpconvIndices{ndim}D; an unsupported
+        # ndim raises a runtime error in the generated code.
+        fn = pccm.FunctionCode()
+        fn.arg("indice_pairs_uniq", "tv::Tensor")
+        fn.arg("ndim", "int")
+        fn.arg("uniq_size", "int64_t")
+        fn.arg("stream_int", "std::uintptr_t", "0", pyanno="int")
+        for dim in self.ndims:
+            impl = f"SpconvIndices{dim}D::generate_conv_inds_stage1_5"
+            fn.raw(f"""
+            if (ndim == {dim}){{
+                return {impl}(indice_pairs_uniq, uniq_size, stream_int);
+            }}
+            """)
+        fn.raw("""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+        return fn.ret("int")
+
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage2(self):
+        # Emits the runtime dispatcher for stage 2 of index generation. Same
+        # shape as the stage-1 dispatcher: derive ndim from `indices`, check
+        # the per-axis vectors, convert them to tv::array and forward to
+        # SpconvIndices{ndim}D::generate_conv_inds_stage2. Declared to
+        # return int.
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg("num_out_act", "int")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"std::vector<int>")
+        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.arg("transposed", f"bool", "false")
+        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
+        # ndim is the second extent of `indices` minus one; every per-axis
+        # vector must have exactly that many entries.
+        code.raw(f"""
+        int ndim = indices.dim(1) - 1;
+        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
+            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
+            padding.size() == ndim, "your params size not equal to ndim", ndim);
+        """)
+
+        # One `if (ndim == K)` branch per supported dimensionality.
+        for ndim in self.ndims:
+            code.raw(f"""
+            if (ndim == {ndim}){{
+                tv::array<int, {ndim}> output_dims_, input_dims_;
+                tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
+                for (int i = 0; i < {ndim}; ++i){{
+                    output_dims_[i] = output_dims[i];
+                    input_dims_[i] = input_dims[i];
+                    ksize_[i] = ksize[i];
+                    stride_[i] = stride[i];
+                    padding_[i] = padding[i];
+                    dilation_[i] = dilation[i];
+                }}
+                return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata,
+                    indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
+                    batch_size, output_dims_, input_dims_, 
+                    ksize_, stride_, padding_, dilation_, transposed, stream_int);
+            }}
+            """)
+        # Reached only when no branch above matched.
+        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+
+        return code.ret("int")
+
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_subm_conv_inds(self):
+        # Emits the runtime dispatcher for submanifold-convolution index
+        # generation: derive ndim from `indices`, check input_dims / ksize /
+        # dilation, convert them to tv::array and forward to
+        # SpconvIndices{ndim}D::generate_subm_conv_inds. Declared to
+        # return int.
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("input_dims", f"std::vector<int>")
+        code.arg("ksize, dilation", f"std::vector<int>")
+        # Optional mask tensor; defaults to an empty tv::Tensor.
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("backward", "bool", "false")
+        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
+        # ndim is the second extent of `indices` minus one; every per-axis
+        # vector must have exactly that many entries.
+        code.raw(f"""
+        int ndim = indices.dim(1) - 1;
+        TV_ASSERT_RT_ERR(input_dims.size() == ndim &&
+            ksize.size() == ndim && dilation.size() == ndim, "your params size not equal to ndim", ndim);
+        """)
+        # One `if (ndim == K)` branch per supported dimensionality.
+        for ndim in self.ndims:
+            code.raw(f"""
+            if (ndim == {ndim}){{
+                tv::array<int, {ndim}> input_dims_;
+                tv::array<int, {ndim}> ksize_, dilation_;
+                for (int i = 0; i < {ndim}; ++i){{
+                    input_dims_[i] = input_dims[i];
+                    ksize_[i] = ksize[i];
+                    dilation_[i] = dilation[i];
+                }}
+                return SpconvIndices{ndim}D::generate_subm_conv_inds(indices, hashdata,
+                    indice_pairs, out_inds, indice_num_per_loc,
+                    batch_size, input_dims_, 
+                    ksize_, dilation_, indice_pair_mask, backward,
+                    stream_int);
+            }}
+            """)
+        # Reached only when no branch above matched.
+        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+        return code.ret("int")
+
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def maxpool_forward(self):
+        # Emits a thin wrapper that forwards its arguments unchanged to
+        # IndiceMaxPool::forward.
+        fn = pccm.FunctionCode()
+        for tensor_name in ("out", "inp", "out_inds", "in_inds"):
+            fn.arg(tensor_name, "tv::Tensor")
+        fn.arg("stream", "std::uintptr_t", "0", pyanno="int")
+        fn.add_dependency(IndiceMaxPool)
+        fn.raw("""
+        return IndiceMaxPool::forward(out, inp, out_inds, in_inds, stream);
+        """)
+        return fn
+
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def maxpool_backward(self):
+        # Emits a thin wrapper that forwards its arguments unchanged to
+        # IndiceMaxPool::backward.
+        fn = pccm.FunctionCode()
+        tensor_names = ("out", "inp", "dout", "dinp", "out_inds", "in_inds")
+        for tensor_name in tensor_names:
+            fn.arg(tensor_name, "tv::Tensor")
+        fn.arg("stream", "std::uintptr_t", "0", pyanno="int")
+        fn.add_dependency(IndiceMaxPool)
+        fn.raw("""
+        return IndiceMaxPool::backward(out, inp, dout, dinp, out_inds, in_inds, stream);
+        """)
+        return fn
+
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def sort_1d_by_key(self):
+        # Emits a function that sorts `data` in place with
+        # thrust::sort_by_key and returns an int32 tensor holding the
+        # permutation applied (initialised to 0..N-1 by arange_kernel and
+        # reordered together with the keys).
+        #
+        # The stream used to be hard-coded to 0 for both the arange launch
+        # and the thrust policy; it is now a defaulted `stream` argument so
+        # this op can be ordered with the other stream-aware ops in this
+        # class. Existing one-argument callers are unaffected.
+        code = pccm.FunctionCode()
+        code.add_dependency(ThrustLib, TensorViewKernel)
+        code.add_param_class("cudakers", CudaCommonKernel())
+        code.arg("data", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
+        code.raw(f"""
+        cudaStream_t stream_cu = reinterpret_cast<cudaStream_t>(stream);
+        tv::Tensor indices({{data.dim(0)}}, tv::int32, 0);
+        tv::cuda::Launch launcher(data.dim(0), stream_cu);
+        launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0));
+        tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            thrust::device_ptr<T> ptr_tr(data.data_ptr<T>());
+            thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>());
+            // run the sort on the same stream as the arange launch
+            auto thrust_ctx = thrust::cuda::par.on(stream_cu);
+            thrust::sort_by_key(thrust_ctx, ptr_tr, ptr_tr + data.dim(0), ptr_k);
+        }});
+        return indices;
+        """)
+        return code.ret("tv::Tensor")
--- a/spconv/csrc/sparse/devleop/sort_bench.py
+++ b/spconv/csrc/sparse/devleop/sort_bench.py
+import torch 
+import time 
+
+def main(num_elems=130000, repeats=10):
+    """Print wall-clock timings for two GPU workloads.
+
+    First, ``repeats`` rounds of sorting two random int32 vectors of
+    ``num_elems`` elements; then ``repeats`` rounds of a
+    ``(num_elems, 27 * 32) x (27 * 32, 32)`` float matrix product written
+    into a preallocated output. Every timing is taken after
+    ``torch.cuda.synchronize()`` so it includes the queued GPU work.
+
+    Args:
+        num_elems: length of the sorted vectors and row count of the GEMM.
+        repeats: number of timed rounds per workload.
+    """
+    arr = torch.randint(0, num_elems, size=[num_elems]).to(torch.int32).cuda()
+    arr2 = torch.randint(0, num_elems, size=[num_elems]).to(torch.int32).cuda()
+
+    # Drain the uploads above so they are not billed to the first round.
+    torch.cuda.synchronize()
+
+    # perf_counter is monotonic and has the best available resolution, which
+    # makes it the right clock for measuring short intervals.
+    t = time.perf_counter()
+    for _ in range(repeats):
+        arr.sort()
+        arr2.sort()
+        torch.cuda.synchronize()
+        print(time.perf_counter() - t)
+        t = time.perf_counter()
+
+    a = torch.rand(num_elems, 27 * 32).cuda().float()
+    b = torch.rand(27 * 32, 32).cuda().float()
+    c = torch.rand(num_elems, 32).cuda().float()
+    for _ in range(repeats):
+        torch.cuda.synchronize()
+        t = time.perf_counter()
+        torch.mm(a, b, out=c)
+        torch.cuda.synchronize()
+        print(time.perf_counter() - t)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/spconv/csrc/sparse/devleop/wtf.py
+++ b/spconv/csrc/sparse/devleop/wtf.py
+#!/home/yy/library/anaconda3/bin/python
+import sys
+from pathlib import Path 
+import ctypes
+# _cudart = ctypes.CDLL('libcudart.so')
+
+# Four directory levels above this file; printed for inspection and appended
+# to sys.path before the spconv imports below.
+_search_root = str(Path(__file__).parent.parent.parent.parent)
+print(_search_root)
+sys.path.append(_search_root)
+
+
+
+from spconv import tensorview as tv 
+
+from spconv.sparse import build
+import numpy as np 
+from pathlib import Path 
+from spconv.spconv_ops_cc.sparse.all.ops import Point2Voxel
+from spconv.spconv_ops_cc.sparse.all import SpconvOps
+
+import time 
+
+def main():
+    """Scratch driver: voxelize a point cloud and allocate index buffers.
+
+    Loads a point cloud from a hard-coded absolute path, times six calls to
+    ``Point2Voxel.point_to_voxel_hash``, then builds the tensors that the
+    commented-out experiment at the end of the function would pass to the
+    index generators. Nothing is returned; results are only printed.
+    """
+    # NOTE(review): machine-specific absolute path; the script only runs
+    # where this file exists.
+    data = np.load("/home/yy/OneDrive/dev/spconv/test/data/benchmark-pc.npz")["pc"].astype(np.float32)
+    print(data.shape, data.dtype)
+    p2v = Point2Voxel([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3, 150000, 1)
+    gs = p2v.grid_size # zyx
+    print(gs)
+    # return
+    data_tv = tv.from_numpy(data).cuda()
+    # Time six back-to-back voxelization calls.
+    # NOTE(review): there is no explicit device synchronization before the
+    # clock is read - confirm the call blocks before trusting the numbers.
+    for i in range(6):
+        t = time.time()
+
+        voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)   
+        
+        print(time.time() - t)
+    voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)   
+    print(voxels.shape, gs)
+    gs_xyz = gs
+    indices_np = indices.cpu().numpy()
+    # indices_offset = indices_np[:, 0] * gs_xyz[1] * gs_xyz[2] + indices_np[:, 1] * gs_xyz[2] + indices_np[:, 2]
+    # uq = np.unique(indices_offset)
+    # print(uq.shape, indices_offset.shape, gs_xyz)
+    # return 
+    ksize = [3] * 3 
+    kv = int(np.prod(ksize))
+    # Prepend an all-zero column to the voxel coordinates (presumably the
+    # batch index, going by the variable name).
+    indices_with_bs = np.zeros((indices_np.shape[0], 4), dtype=np.int32)
+    indices_with_bs[:, 1:] = indices_np
+    print(indices_with_bs.mean(), indices_with_bs.max(), indices_with_bs.min())
+
+    # The buffers below are only consumed by the commented-out experiment
+    # at the end of this function.
+    indices = tv.from_numpy(indices_with_bs).cuda()
+    out_indices = tv.zeros([indices.dim(0) * kv, 4], tv.int32, 0)
+    indice_num_per_loc = tv.zeros([kv], tv.int32, 0)
+
+
+    points = voxels.view([-1, 3])
+    hashdata = tv.zeros([points.dim(0) * kv * 2], tv.custom64, 0)
+    hashdata_subm = tv.zeros([points.dim(0) * 2], tv.custom64, 0)
+
+    indice_pairs = tv.full([2, kv, indices.dim(0)], -1, tv.int32, 0)
+    indice_pairs_uniq = tv.zeros([indice_pairs.size // 2 + 1], tv.int32, 0)
+
+    # for i in range(10):
+    #     indice_pairs.fill_int_(-1)
+    #     np.random.shuffle(indices_with_bs)
+    #     indices = tv.from_numpy(indices_with_bs).cuda()
+
+    #     indice_num_per_loc.zero_()
+    #     out_act = SpconvOps.generate_conv_inds(indices, hashdata, indice_pairs,
+    #         indice_pairs_uniq, out_indices, indice_num_per_loc, 
+    #         1, gs, gs, [3, 3, 3], [1, 1, 1], [1, 1, 1], [1, 1, 1])
+    #     indice_num_per_loc.zero_()
+    #     out_act = SpconvOps.generate_subm_conv_inds(indices, hashdata_subm, indice_pairs,
+    #         out_indices, indice_num_per_loc, 
+    #         1, gs, ksize, [1, 1, 1])
+    #     indice_num_per_loc_cpu = indice_num_per_loc.cpu().numpy()
+    #     indice_pairs_cpu = indice_pairs.cpu().numpy()
+    #     indice_pairs_cpu_flat = indice_pairs_cpu.reshape(-1)
+    #     uq, count = np.unique(indice_pairs_cpu_flat, return_counts=True)
+    #     print(out_act, indice_pairs_cpu.shape, indice_pairs_cpu.mean(), indice_num_per_loc_cpu.tolist())
+    #     print(indice_pairs_cpu[:, 13, :2])
+    #     print(uq, count)
+
+if __name__ == "__main__":
+
+    main()
\ No newline at end of file
--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from cumm.conv.bases import ConvEnum
+from cumm.gemm.core.metaarray import MetaArray, seq
+from cumm import dtypes
+import pccm 
+from cumm.gemm.layout import TensorGeneric, to_stride
+from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib
+from cumm.gemm import codeops
+from typing import List 
+from cumm.conv.params import ConvProblem
+import numpy as np 
+
+class CudaCommonKernel(pccm.ParameterizedClass):
+    # Small templated CUDA __global__ helpers (arange and fill) shared by
+    # the generated ops.
+    #
+    # we need to use PClass instead of Class
+    # because cuda global function can't be put in class body.
+    @pccm.cuda.cuda_global_function
+    def arange_kernel(self):
+        # Kernel template <T>: writes data[i] = T(i) for each index i that
+        # tv::KernelLoopX<int>(size) hands to the calling thread.
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("data", f"T*") 
+        code.arg("size", f"int") 
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(size)) {{
+            data[i] = T(i);
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def fill_kernel(self):
+        # Kernel template <T>: writes data[i] = T(val) for each index i that
+        # tv::KernelLoopX<int>(size) hands to the calling thread.
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("data", f"T*") 
+        code.arg("val", f"T")
+        code.arg("size", f"int") 
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(size)) {{
+            data[i] = T(val);
+        }}
+        """)
+        return code
+
+
+class ConvOutLocIter(pccm.ParameterizedClass):
+    # TODO add conv transpose
+    def __init__(self, problem: ConvProblem):
+        # Register the problem description and the two layout helpers as
+        # parameter classes, then declare the C++ members that hold them
+        # together with the per-axis kernel position counter.
+        super().__init__()
+        self.add_dependency(TensorView)
+        self.add_param_class("lociter", problem, "ConvProblem")
+        npq_layout = TensorGeneric(problem.ndim + 1, False)
+        rs_layout = TensorGeneric(problem.ndim, False)
+        self.add_param_class("lociter", npq_layout, "LayoutNPQ")
+        self.add_param_class("lociter_rs", rs_layout, "LayoutRS")
+
+        self.ndim = problem.ndim
+        members = (
+            ("problem_", "ConvProblem"),
+            ("count_", f"tv::array<int, {self.ndim}>"),
+            ("layout_npq", "LayoutNPQ"),
+            ("layout_rs", "LayoutRS"),
+        )
+        for member_name, member_type in members:
+            self.add_member(member_name, member_type)
+
+    @pccm.cuda.constructor(host=True, device=True, forceinline=True)
+    def ctor(self):
+        # Emits the constructor: copy the problem, zero the kernel position
+        # counter, and build both layouts from the problem's extents.
+        ctor_code = pccm.FunctionCode()
+        ctor_code.arg("problem", "ConvProblem const&")
+        ctor_code.ctor_init("problem_", "problem")
+
+        axes = range(self.ndim)
+        zero_list = ", ".join("0" for _ in axes)
+        ctor_code.ctor_init("count_", f"{{{zero_list}}}")
+
+        out_extents = codeops.unpack("problem.output_dims", axes)
+        kernel_extents = codeops.unpack("problem.ksize", axes)
+        npq_init = f"LayoutNPQ::from_shape({{problem.N, {out_extents}}})"
+        rs_init = f"LayoutRS::from_shape({{{kernel_extents}}})"
+        ctor_code.ctor_init("layout_npq", npq_init)
+        ctor_code.ctor_init("layout_rs", rs_init)
+        return ctor_code
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               name="operator++")
+    def increment(self):
+        # Emits operator++: advance the kernel position like an odometer,
+        # last axis fastest; an axis that reaches its ksize wraps to zero
+        # and carries into the previous axis.
+        body = pccm.FunctionCode()
+        for axis in reversed(range(self.ndim)):
+            body.raw(f"""
+            if (++count_[{axis}] < problem_.ksize[{axis}]){{
+                return *this;
+            }}
+            count_[{axis}] = 0;
+            """)
+        body.raw("return *this;")
+        self_ref = f"{self.class_name}&"
+        return body.ret(self_ref)
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True)
+    def set_filter_offset(self):
+        # Emits a setter that decodes a linear filter offset into count_
+        # through layout_rs.inverse.
+        setter = pccm.FunctionCode()
+        setter.arg("filter_offset", "int")
+        setter.raw("""
+        layout_rs.inverse(filter_offset, count_);
+        """)
+        return setter
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def nhw_to_npq(self):
+        # Emits nhw_to_npq<NoStride>: per axis,
+        # (in + padding - r * dilation) / stride, with the divisor forced to
+        # 1 when NoStride is true. Element 0 of the input is copied through.
+        fn = pccm.FunctionCode()
+        fn.arg("nhw_offset", "const int*")
+        fn.nontype_targ("NoStride", "bool")
+        axes = range(self.ndim)
+        for ax in axes:
+            src = ax + 1
+            fn.raw(f"""
+            int r_{ax} = count_[{ax}];
+            int h_{ax} = (nhw_offset[{src}] + problem_.padding[{ax}] - 
+                r_{ax} * problem_.dilation[{ax}]) / (NoStride ? 1 : problem_.stride[{ax}]);
+            """)
+        coords = codeops.unpack_str("h", axes)
+        fn.raw(f"""
+        return {{nhw_offset[0], {coords}}};
+        """)
+        result_type = f"tv::array<int, {self.ndim + 1}>"
+        return fn.ret(result_type)
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def npq_to_nhw(self):
+        # Emits npq_to_nhw: per axis, out * stride - padding + r * dilation.
+        # Element 0 of the input is copied through.
+        fn = pccm.FunctionCode()
+        fn.arg("npq_offset", "const int*")
+        axes = range(self.ndim)
+        for ax in axes:
+            src = ax + 1
+            fn.raw(f"""
+            int r_{ax} = count_[{ax}];
+            int h_{ax} = npq_offset[{src}] * problem_.stride[{ax}] - problem_.padding[{ax}] + r_{ax} * problem_.dilation[{ax}];
+            """)
+        coords = codeops.unpack_str("h", axes)
+        fn.raw(f"""
+        return {{npq_offset[0], {coords}}};
+        """)
+        result_type = f"tv::array<int, {self.ndim + 1}>"
+        return fn.ret(result_type)
+
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_npq(self):
+        # Emits query_npq: compute the unstrided coordinate, divide each
+        # axis by its stride into npq_offset, and report whether element 0
+        # is below N, every axis lies inside output_dims, and every
+        # unstrided coordinate is an exact multiple of the stride.
+        fn = pccm.FunctionCode()
+        fn.arg("nhw_offset", "const int*")
+        fn.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
+        fn.ret("bool")
+        fn.raw("""
+        auto npq_no_stride = nhw_to_npq<true>(nhw_offset);
+        npq_offset[0] = npq_no_stride[0];
+        """)
+        axes = range(self.ndim)
+        for ax in axes:
+            fn.raw(f"npq_offset[{ax + 1}] = npq_no_stride[{ax + 1}] / problem_.stride[{ax}];")
+        in_bounds = " && ".join(
+            f"npq_offset[{ax + 1}] >= 0 && "
+            f"npq_offset[{ax + 1}] < problem_.output_dims[{ax}]"
+            for ax in axes)
+        on_stride = " && ".join(
+            f"!(npq_no_stride[{ax + 1}] % problem_.stride[{ax}])"
+            for ax in axes)
+        fn.raw(f"""
+        return npq_no_stride[0] < problem_.N && 
+            {in_bounds} &&
+            {on_stride};
+        """)
+        return fn
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_npq_no_stride(self):
+        # Emits query_npq_no_stride: store nhw_to_npq<true>(...) in
+        # npq_offset and report whether element 0 is below N and every axis
+        # lies inside output_dims.
+        fn = pccm.FunctionCode()
+        fn.arg("nhw_offset", "const int*")
+        fn.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
+        fn.ret("bool")
+        fn.raw("""
+        npq_offset = nhw_to_npq<true>(nhw_offset);
+        """)
+        in_bounds = " && ".join(
+            f"npq_offset[{ax + 1}] >= 0 && "
+            f"npq_offset[{ax + 1}] < problem_.output_dims[{ax}]"
+            for ax in range(self.ndim))
+        fn.raw(f"""
+        return npq_offset[0] < problem_.N && 
+            {in_bounds};
+        """)
+        return fn
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_nhw(self):
+        # Emits query_nhw: store npq_to_nhw(...) in nhw_offset and report
+        # whether element 0 is below N and every axis lies inside
+        # input_dims.
+        fn = pccm.FunctionCode()
+        fn.arg("npq_offset", "const int*")
+        fn.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
+        fn.ret("bool")
+        fn.raw("""
+        nhw_offset = npq_to_nhw(npq_offset);
+        """)
+        in_bounds = " && ".join(
+            f"nhw_offset[{ax + 1}] >= 0 && "
+            f"nhw_offset[{ax + 1}] < problem_.input_dims[{ax}]"
+            for ax in range(self.ndim))
+        fn.raw(f"""
+        return nhw_offset[0] < problem_.N && 
+            {in_bounds};
+        """)
+        return fn
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_nhw_out(self):
+        # Emits query_nhw_out: same computation as query_nhw, but the
+        # per-axis bounds are checked against output_dims.
+        fn = pccm.FunctionCode()
+        fn.arg("npq_offset", "const int*")
+        fn.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
+        fn.ret("bool")
+        fn.raw("""
+        nhw_offset = npq_to_nhw(npq_offset);
+        """)
+        in_bounds = " && ".join(
+            f"nhw_offset[{ax + 1}] >= 0 && "
+            f"nhw_offset[{ax + 1}] < problem_.output_dims[{ax}]"
+            for ax in range(self.ndim))
+        fn.raw(f"""
+        return nhw_offset[0] < problem_.N && 
+            {in_bounds};
+        """)
+        return fn
+
+class SparseConvIndicesKernel(pccm.ParameterizedClass):
+    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
+        # Wire up dependencies and parameter classes, then record the
+        # dimensionality and the index dtype (int32 or int64 only).
+        super().__init__()
+        self.add_dependency(TensorView, TensorViewKernel,
+                            TensorViewHashKernel, ThrustLib)
+        self.loc_iter = ConvOutLocIter(problem)
+        self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
+        self.add_param_class("spinds", problem, "ConvProblem")
+        self.add_param_class("cudakers", CudaCommonKernel())
+
+        self.ndim = problem.ndim
+        # The uniq buffer uses the same dtype as the index pairs.
+        self.dtype_indices = self.dtype_indices_uniq = dtype_indices
+
+        assert (dtype_indices == dtypes.int32
+                or dtype_indices == dtypes.int64)
+
+
+    @pccm.cuda.cuda_global_function
+    def calc_conv_indices_stage1(self):
+        # Emits the stage-1 kernel. blockIdx.y selects the filter offset;
+        # for every input row whose mapped output location is valid the
+        # kernel claims a slot through the per-offset counter and records
+        # the input row index plus the linearised output location.
+        code = pccm.FunctionCode()
+        code.arg("loc_iter", f"ConvLocIter") # location iterator, taken by value
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        # Indexed below as [kernelProd, MaxSize] (no leading pair axis).
+        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*") # [kernelProd]
+
+        code.arg("num_indices_in", "int")
+        code.arg("indices_pair_size", "int")
+
+        code.arg("RS", "int")
+        code.arg("transposed", "bool")
+
+        # indices_pair_size * RS is the distance between the two halves of
+        # indice_pairs. Writes are skipped once a filter offset has handed
+        # out indices_pair_size slots, but its counter keeps incrementing.
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        loc_iter.set_filter_offset(filter_offset);
+        int indices_pair_size_mul_RS = indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+        for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+            tv::array<int, {self.ndim + 1}> npq_offset;
+            bool valid;
+            if (transposed){{
+                valid = loc_iter.query_nhw_out(indices_in + i * {self.ndim + 1}, npq_offset);
+            }}else{{
+                valid = loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset);
+            }}
+            if (valid){{
+                int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
+                {self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
+                if (old_num < indices_pair_size){{
+                    indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
+                    indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = offset;
+                    indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + old_num] = offset;
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def build_conv_hash_table(self):
+        # Emits a kernel that, for each of the first num_indices entries of
+        # indice_pairs_for_uniq, decodes the linear index into a coordinate
+        # row of indices_out and inserts (linear index -> row number) into
+        # the hash table.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+
+        code.arg("table", f"TTable") # hash table, taken by value
+        code.arg("indices_out", f"int*") # [N, ndim + 1]
+        # Read as a flat array of num_indices linear indices.
+        code.arg("indice_pairs_for_uniq", f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize]
+
+        code.arg("layout_npq", f"spinds::LayoutNPQ") # used to decode linear indices
+
+        code.arg("num_indices", "int")
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(num_indices)) {{
+            {self.dtype_indices} index = indice_pairs_for_uniq[i];
+            layout_npq.inverse(index, indices_out + {self.ndim + 1} * i);
+            table.insert(index, i);
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_conv_indices_stage2(self):
+        # Emits the stage-2 kernel. blockIdx.y selects the filter offset;
+        # every non-negative entry in that offset's slice of
+        # indice_pairs_out_part is looked up in the hash table and, when
+        # found, overwritten in place with the mapped value.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", f"TTable") # hash table, taken by value
+        # NOTE(review): declared int* while entries are loaded into a
+        # dtype_indices variable; the two only agree for int32 - confirm
+        # before enabling int64 indices.
+        code.arg("indice_pairs_out_part", f"int*") # [2, kernelProd, MaxSize]
+        code.arg("num_indices_in", "int")
+        code.arg("indices_pair_size", "int")
+        # TODO use block instead of filter_offset?
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
+        for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+            {self.dtype_indices} index = indice_pairs_out_part_filter[i];
+            if (index > -1){{
+                auto ptr = table.lookup_ptr(index);
+                if (ptr){{
+                    indice_pairs_out_part_filter[i] = ptr->second;
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def build_subm_conv_hash_table(self):
+        # Emits a kernel that linearises every input coordinate row with
+        # layout_npq and inserts (linear index -> row number) into the hash
+        # table.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+
+        code.arg("table", f"TTable") # hash table, taken by value
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+
+        code.arg("layout_npq", f"spinds::LayoutNPQ") 
+
+        code.arg("num_indices", "int")
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(num_indices)) {{
+            {self.dtype_indices} index = layout_npq(indices_in + i * {self.ndim + 1});
+            table.insert(index, i);
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def clean_indices_uniq(self):
+        # Emits a kernel that sets the first `size` entries of
+        # indice_pairs_for_uniq to the maximum value of the index dtype.
+        # presumably a sentinel that sorts after every real index - TODO
+        # confirm against the stage-1.5 caller.
+        code = pccm.FunctionCode()
+        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") 
+        code.arg("size", f"{self.dtype_indices}") 
+        code.raw(f"""
+        for ({self.dtype_indices} i : tv::KernelLoopX<{self.dtype_indices}>(size)) {{
+            indice_pairs_for_uniq[i] = std::numeric_limits<{self.dtype_indices}>::max();
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_subm_conv_indices(self):
+        # Emits the submanifold index kernel. blockIdx.y selects the filter
+        # offset. The centre offset (RS / 2) pairs every input row with
+        # itself. Any other offset looks the neighbouring location up in the
+        # hash table and, on a hit, writes the (in, out) pair for this
+        # offset and the swapped pair for the mirrored offset
+        # RS - 1 - filter_offset.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("loc_iter", f"ConvLocIter") # location iterator, taken by value
+        code.arg("table", f"TTable") # hash table, taken by value
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*") # [kernelProd]
+
+        code.arg("num_indices_in", "int")
+        code.arg("indices_pair_size", "int")
+
+        code.arg("RS", "int")
+        # NOTE(review): the centre branch does not touch indice_num_per_loc,
+        # and the mirrored writes reuse this offset's slot number - this
+        # presumably relies on the launch covering only half of the filter
+        # offsets; confirm against the caller.
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        loc_iter.set_filter_offset(filter_offset);
+        int indices_pair_size_mul_RS = indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+
+        int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
+        if (filter_offset == (RS / 2)){{
+            for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+                indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
+                indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
+            }}
+        }} else {{
+            for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+                tv::array<int, {self.ndim + 1}> npq_offset;
+                if (loc_iter.query_npq_no_stride(indices_in + i * {self.ndim + 1}, npq_offset)){{
+                    {self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
+                    auto item = table.lookup(offset); // performance bound
+                    if (!item.empty()){{
+                        int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
+                        indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = item.second;
+                        indice_pairs[filter_offset_mul_indices_pair_size_1 + old_num] = item.second;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + old_num] = i;
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_subm_conv_indices_mask(self):
+        # CUDA global function (templated on the hash table type): masked
+        # variant of the submanifold pair generation. Each blockIdx.y
+        # handles one filter offset.
+        #   * Pairs are written at the position of the index itself
+        #     (output_index / input_index) instead of being compacted with
+        #     an atomic counter.
+        #   * On a table hit, bit `filter_offset` is OR-ed into
+        #     mask[output_index] and bit `RS - 1 - filter_offset` into
+        #     mask[input_index].
+        #   * The center offset (RS / 2) only writes identity pairs; its
+        #     mask bit is not set here (the atomicOr is commented out in
+        #     the emitted code).
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("loc_iter", f"ConvLocIter") # location iterator, passed by value
+        code.arg("table", f"TTable") # hash table, keyed by loc_iter.layout_npq(...)
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("mask", f"uint32_t*") # one bitmask per index, bit k <-> filter offset k
+
+        code.arg("num_indices", "int")
+        # stride between two consecutive filter offsets inside indice_pairs
+        code.arg("indices_pair_size", "int")
+
+        # number of filter offsets; RS * indices_pair_size is the start of
+        # the second half of indice_pairs
+        code.arg("RS", "int")
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        uint32_t filter_mask_out = (1u << (filter_offset));
+        uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
+        // uint32_t filter_mask_center = (1u << (RS / 2));
+
+        loc_iter.set_filter_offset(filter_offset);
+        int indices_pair_size_mul_RS = indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+
+        int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
+        if (filter_offset == (RS / 2)){{
+            for (int i : tv::KernelLoopX<int>(num_indices)) {{
+                // atomicOr(mask + i, filter_mask_center);
+                indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
+                indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
+            }}
+        }} else {{
+            for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
+                // find input offset from output offset
+                tv::array<int, {self.ndim + 1}> nhw_offset;
+                // table: input indice coord to output index (or output indice coord to input index)
+                if (loc_iter.query_nhw(indices_in + output_index * {self.ndim + 1}, nhw_offset)){{
+                    {self.dtype_indices} offset = loc_iter.layout_npq(nhw_offset);
+                    auto item = table.lookup(offset);
+                    if (!item.empty()) {{
+                        auto input_index = item.second; // we find a input indice idx.
+                        atomicOr(mask + output_index, filter_mask_out);
+                        atomicOr(mask + input_index, filter_mask_in);
+                        // for this output, we set correct input idx.
+                        indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
+                        // the output in "input location" connect this output idx in another location.
+                        indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + input_index] = output_index;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_subm_conv_indices_split_mask(self):
+        # CUDA global function (templated on the hash table type): same
+        # pair generation as calc_subm_conv_indices_mask, but the two bit
+        # sets go to separate buffers. On a table hit, bit `filter_offset`
+        # is OR-ed into mask1[output_index] and bit
+        # `RS - 1 - filter_offset` into mask2[input_index].
+        # `indice_ptr_inv` in the emitted code points at the second half of
+        # `indice_pairs` (offset indices_pair_size * RS).
+        # The center offset (RS / 2) only writes identity pairs and does
+        # not touch either mask.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("loc_iter", f"ConvLocIter") # location iterator, passed by value
+        code.arg("table", f"TTable") # hash table, keyed by loc_iter.layout_npq(...)
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("mask1", f"uint32_t*") # one bitmask per index, written at output_index
+        code.arg("mask2", f"uint32_t*") # one bitmask per index, written at input_index
+
+        code.arg("num_indices", "int")
+        # stride between two consecutive filter offsets inside indice_pairs
+        code.arg("indices_pair_size", "int")
+
+        # number of filter offsets
+        code.arg("RS", "int")
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        uint32_t filter_mask_out = (1u << (filter_offset));
+        uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
+        // uint32_t filter_mask_center = (1u << (RS / 2));
+
+        loc_iter.set_filter_offset(filter_offset);
+        auto indice_ptr_inv = indice_pairs + indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+        int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
+        if (filter_offset == (RS / 2)){{
+            for (int i : tv::KernelLoopX<int>(num_indices)) {{
+                indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
+                indice_ptr_inv[filter_offset_mul_indices_pair_size + i] = i;
+            }}
+        }} else {{
+            for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
+                // find input offset from output offset
+                tv::array<int, {self.ndim + 1}> nhw_offset;
+                // table: input indice coord to output index (or output indice coord to input index)
+                if (loc_iter.query_nhw(indices_in + output_index * {self.ndim + 1}, nhw_offset)){{
+                    {self.dtype_indices} offset = loc_iter.layout_npq(nhw_offset);
+                    auto item = table.lookup(offset);
+                    if (!item.empty()) {{
+                        auto input_index = item.second; // we find a input indice idx.
+                        atomicOr(mask1 + output_index, filter_mask_out);
+                        atomicOr(mask2 + input_index, filter_mask_in);
+                        // for this output, we set correct input idx.
+                        indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
+                        // the output in "input location" connect this output idx in another location.
+                        indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
+                        indice_ptr_inv[filter_offset_mul_indices_pair_size + input_index] = output_index;
+                        indice_ptr_inv[filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage1(self):
+        # Static function that launches the first stage of conv index
+        # generation. The emitted code
+        #   1. checks that indice_pairs.dim(1) and indice_num_per_loc.dim(0)
+        #      equal kv = prod(ksize) and that indice_pairs_uniq holds at
+        #      least indice_pairs.size() / 2 + 1 entries;
+        #   2. launches `clean_indices_uniq` over those
+        #      indice_pairs.size() / 2 + 1 entries of `indice_pairs_uniq`;
+        #   3. launches `calc_conv_indices_stage1` with one grid row
+        #      (blocks.y) per kernel offset.
+        # Both launches use the CUDA stream passed as `stream_int`.
+        # Nothing is returned: the thrust sort/unique code is commented out
+        # in the emitted body (generate_conv_inds_stage1_5 contains the
+        # same calls). `expected_out_size` is computed but never used.
+        code = pccm.FunctionCode()
+        code.arg("indices", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("transposed", f"bool", "false")
+
+        # CUDA stream handle passed as an integer; 0 is the default value
+        code.arg("stream_int", f"std::uintptr_t", "0")
+
+        code.raw(f"""
+        // TODO stream
+        // TODO handle num input == 0
+        int kv = tv::arrayops::prod(ksize);
+        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
+        // indice_pairs: [2, kv, indices.dim(0)]
+        // indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
+        int64_t uniq_size = indice_pairs.size() / 2 + 1;
+        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= uniq_size, "error");
+        TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
+        int64_t expected_out_size = indices.dim(0) * kv;
+        tv::cuda::Launch launcher_num_act_in(indices.dim(0), reinterpret_cast<cudaStream_t>(stream_int));
+        // tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
+        launcher_num_act_in.blocks.y = kv;
+        ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
+        ConvLocIter loc_iter(problem);
+        tv::cuda::Launch launcher_clean_uniq(uniq_size, reinterpret_cast<cudaStream_t>(stream_int));
+        launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), uniq_size);
+        launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(), 
+            indice_pairs.data_ptr<{self.dtype_indices}>(), 
+            indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
+            indice_pairs.dim(2), kv, transposed);
+        // thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
+        // auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
+        // thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        // auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        // auto num_out_act = new_end - ptr_tr - 1;
+        // return num_out_act;
+        """)
+        return code# .ret("int")
+
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage1_5(self):
+        # Static function that sorts the first `uniq_size` entries of
+        # `indice_pairs_uniq` in place with thrust, removes consecutive
+        # duplicates, and returns the number of remaining entries minus
+        # one. Both thrust calls run on the stream given by `stream_int`.
+        # NOTE(review): the "- 1" presumably discards the max-value filler
+        # written by clean_indices_uniq -- confirm against the caller.
+        # The result is a pointer difference, while the declared return
+        # type is `int`.
+        code = pccm.FunctionCode()
+        code.arg("indice_pairs_uniq", "tv::Tensor")
+        code.arg("uniq_size", "int64_t")
+        code.arg("stream_int", f"std::uintptr_t", "0")
+        code.raw(f"""
+        thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
+        auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
+        thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        auto num_out_act = new_end - ptr_tr - 1;
+        return num_out_act;
+        """)
+        return code.ret("int")
+
+
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage2(self):
+        # Static function that launches the second stage of conv index
+        # generation. The emitted code
+        #   1. checks that indice_pairs.dim(1) == prod(ksize), that
+        #      indice_pairs_uniq and hashdata hold at least `num_out_act`
+        #      entries, and that out_inds is at least
+        #      [num_out_act, ndim + 1];
+        #   2. slices `indice_pairs_uniq` to its first `num_out_act`
+        #      entries;
+        #   3. wraps `hashdata` in a LinearHashTable (empty key = max value
+        #      of the index dtype), clears it and launches
+        #      `build_conv_hash_table`;
+        #   4. launches `calc_conv_indices_stage2` on indice_pairs[1] with
+        #      one grid row (blocks.y) per kernel offset.
+        # All work is issued on the stream given by `stream_int`, and
+        # `num_out_act` is returned unchanged.
+        # `timer`, `uniq_size` and the arguments `indices_in`-independent
+        # `transposed` are declared but not used in the emitted body.
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg("num_out_act", "int")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("transposed", f"bool", "false")
+        code.arg("stream_int", f"std::uintptr_t", "0")
+        code.raw(f"""
+        auto custream = reinterpret_cast<cudaStream_t>(stream_int);
+        // TODO stream
+        // TODO handle num input == 0
+        int kv = tv::arrayops::prod(ksize);
+        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
+        // indice_pairs: [2, kv, indices.dim(0)]
+        // indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
+        // out_inds: [MaxSize, {self.ndim + 1}]
+        auto timer = tv::CudaContextTimer<>();
+        int64_t uniq_size = indice_pairs.size() / 2 + 1;
+        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= num_out_act, "error");
+        TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
+        tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
+        launcher_num_act_in.blocks.y = kv;
+        ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
+        ConvLocIter loc_iter(problem);
+        
+        // TODO handle invalid num_out_act
+        indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
+        tv::cuda::Launch lanucher_build_hash(num_out_act, custream);
+        using V = {self.dtype_indices};
+        using KeyType = {self.dtype_indices};
+        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
+        using table_t =
+            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
+                                        kEmptyKey, false>;
+        using pair_t = typename table_t::value_type;
+        TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
+        table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
+        hash.clear(custream);
+        lanucher_build_hash(build_conv_hash_table<table_t>, hash, 
+            out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(), 
+            loc_iter.layout_npq, num_out_act);
+        launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, 
+            indice_pairs[1].data_ptr<int>(), indices.dim(0), 
+            indice_pairs.dim(2));
+        return num_out_act;
+        """)
+        return code.ret("int")
+
+
+    @pccm.cuda.static_function
+    def generate_subm_conv_inds(self):
+        # Static function that generates the index pairs of a submanifold
+        # convolution. The emitted code
+        #   1. requires odd kernel sizes and derives stride = 1 and
+        #      padding = (ksize / 2) * dilation per dimension, using
+        #      `input_dims` for both sides of the ConvProblem;
+        #   2. builds a hash table over `indices` in the `hashdata`
+        #      storage (build_subm_conv_hash_table);
+        #   3. launches one of three kernels with blocks.y = kv / 2 + 1,
+        #      i.e. only the first half of the filter offsets plus the
+        #      center -- each kernel also writes the mirrored offset:
+        #        * indice_pair_mask is 2-D with dim(0) == 2:
+        #          calc_subm_conv_indices_split_mask; mask[0] is pre-filled
+        #          with the center bit (1 << (kv / 2)), mask[1] is zeroed;
+        #        * any other non-empty indice_pair_mask (asserted 1-D after
+        #          the fill launch): calc_subm_conv_indices_mask, mask
+        #          pre-filled with the center bit;
+        #        * empty indice_pair_mask: calc_subm_conv_indices, which
+        #          counts pairs in indice_num_per_loc.
+        # When a mask is given, prod(ksize) must be < 32.
+        # The function returns indices.dim(0). The arguments `out_inds`
+        # and `backward` are not referenced by the emitted body.
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("input_dims", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
+        # optional mask tensor; defaults to an empty tensor
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("backward", "bool", "false")
+        code.arg("stream_int", f"std::uintptr_t", "0")
+
+        code.raw(f"""
+        auto custream = reinterpret_cast<cudaStream_t>(stream_int);
+        auto ctx = tv::Context();
+        ctx.set_cuda_stream(custream);
+        if (!indice_pair_mask.empty()){{
+            TV_ASSERT_INVALID_ARG(tv::arrayops::prod(ksize) < 32, "for now only support 32bit mask");
+        }}
+        // TODO stream
+        // TODO handle num input == 0
+        tv::array<int, {self.ndim}> stride, padding;
+        for (int i = 0; i < {self.ndim}; ++i){{
+            TV_ASSERT_RT_ERR(ksize[i] % 2 == 1, "subm only support odd ksize");
+            stride[i] = 1;
+            padding[i] = (ksize[i] / 2) * dilation[i];
+        }}
+        int kv = tv::arrayops::prod(ksize);
+        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
+        // indice_pairs: [2, kv, indices.dim(0)]
+        // out_inds: [MaxSize, {self.ndim + 1}]
+        // auto timer = tv::CudaContextTimer<>();
+        TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
+        tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
+        launcher_num_act_in.blocks.y = (kv / 2) + 1;
+        // launcher_num_act_in.blocks.y = kv;
+
+        ConvProblem problem(batch_size, 1, 1, input_dims, input_dims, ksize, padding, stride, dilation);
+        ConvLocIter loc_iter(problem);
+
+        tv::cuda::Launch lanucher_build_hash(indices.dim(0), custream);
+        using V = {self.dtype_indices};
+        using KeyType = {self.dtype_indices};
+        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
+
+        using table_t =
+            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
+                                        kEmptyKey, false>;
+        using pair_t = typename table_t::value_type;
+        TV_ASSERT_RT_ERR(hashdata.dim(0) >= indices.dim(0), "hash size not enough");
+        table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
+        hash.clear(custream);
+        // tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
+
+        lanucher_build_hash(build_subm_conv_hash_table<table_t>, hash, indices.data_ptr<const int>(),
+            loc_iter.layout_npq, indices.dim(0));
+        // tv::ssprint("build_hash time", timer.report() / 1000.0);
+        if (!indice_pair_mask.empty()){{
+            if (indice_pair_mask.ndim() == 2 && indice_pair_mask.dim(0) == 2){{
+                auto mask_0 = indice_pair_mask[0];
+                tv::cuda::Launch lanucher_fill(mask_0.size(), custream);
+                lanucher_fill(cudakers::fill_kernel<int>, mask_0.data_ptr<int>(), (1 << (kv / 2)), mask_0.size());
+                indice_pair_mask[1].zero_(ctx);
+                auto kernel = &calc_subm_conv_indices_split_mask<table_t>;
+                launcher_num_act_in(kernel, loc_iter, hash,  
+                    indices.data_ptr<int>(), indice_pairs.data_ptr<int>(), 
+                    indice_pair_mask[0].data_ptr<uint32_t>(), indice_pair_mask[1].data_ptr<uint32_t>(), 
+                    indices.dim(0), indice_pairs.dim(2), kv);
+            }}else{{
+                tv::cuda::Launch lanucher_fill(indice_pair_mask.size(), custream);
+                lanucher_fill(cudakers::fill_kernel<int>, indice_pair_mask.data_ptr<int>(), (1 << (kv / 2)), indice_pair_mask.size());
+                TV_ASSERT_RT_ERR(indice_pair_mask.ndim() == 1, "error");
+                launcher_num_act_in(calc_subm_conv_indices_mask<table_t>, loc_iter, hash, 
+                    indices.data_ptr<int>(), indice_pairs.data_ptr<int>(), 
+                    indice_pair_mask.data_ptr<uint32_t>(), indices.dim(0), indice_pairs.dim(2), kv);
+            }}
+        }}else{{
+            launcher_num_act_in(calc_subm_conv_indices<table_t>, loc_iter, hash, indices.data_ptr<int>(), 
+                indice_pairs.data_ptr<int>(), 
+                indice_num_per_loc.data_ptr<int>(), indices.dim(0), indice_pairs.dim(2), kv);
+        }}
+        // tv::ssprint("gem subm conv inds time", timer.report() / 1000.0);
+        return indices.dim(0);
+        """)
+
+        return code.ret("int")
--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from cumm.conv.bases import ConvEnum
+from cumm.gemm.core.metaarray import MetaArray, seq
+from cumm import dtypes
+import pccm 
+from cumm.gemm.layout import TensorGeneric, to_stride
+from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
+from cumm.gemm import codeops
+from typing import List 
+from cumm.conv.params import ConvProblem
+from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
+import numpy as np 
+from cumm.gemm import (thread_map)
+
+# pccm class that generates the CUDA max-pool kernels driven by index
+# pairs (out_indices[i], in_indices[i]) together with their static
+# launcher functions `forward` and `backward`.
+class IndiceMaxPool(pccm.Class):
+    # TODO optimize this function
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorViewKernel, TensorView, GemmBasic)
+    
+    @pccm.cuda.cuda_global_function
+    def forward_kernel(self):
+        # CUDA global function templated on the feature type T. For every
+        # pair i (looped with KernelLoopY) and every feature j (looped with
+        # KernelLoopX) it replaces out_features[out_idx, j] with
+        # in_features[in_idx, j] when the input value is strictly larger.
+        # The read-compare-write is not atomic.
+        # NOTE(review): presumably each launch never maps two pairs to the
+        # same output row -- confirm against the caller.
+        code = pccm.FunctionCode()
+        code.targ("T")
+
+        code.arg("out_features", f"T*") 
+        code.arg("in_features", f"const T*")
+        code.arg("out_indices", "const int*")
+        code.arg("in_indices", "const int*")
+        code.arg("size", "int")  # number of index pairs
+        code.arg("num_features", "int")  # row length of both feature buffers
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(size)) {{
+            int in_idx = in_indices[i];
+            int out_idx = out_indices[i];
+            auto in_ptr = in_features + in_idx * num_features;
+            auto out_ptr = out_features + out_idx * num_features;
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                auto in = in_ptr[j];
+                auto out = out_ptr[j];
+                if (in > out){{
+                    out_ptr[j] = in;
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def backward_kernel(self):
+        # CUDA global function templated on the feature type T. For every
+        # pair i and feature j, the output gradient dout[out_idx, j] is
+        # added to din[in_idx, j] when in_features[in_idx, j] equals
+        # out_features[out_idx, j] exactly. The accumulation is a plain
+        # (non-atomic) read-modify-write, and the row offsets are computed
+        # in `int`.
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("out_features", f"const T*") 
+        code.arg("in_features", f"const T*")
+        code.arg("dout_features", f"const T*") 
+        code.arg("din_features", f"T*")
+        code.arg("out_indices", "const int*")
+        code.arg("in_indices", "const int*")
+        code.arg("size", "int")  # number of index pairs
+        code.arg("num_features", "int")  # row length of all feature buffers
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(size)) {{
+            int in_idx_offset = in_indices[i] * num_features;
+            int out_idx_offset = out_indices[i] * num_features;
+            auto in_ptr = in_features + in_idx_offset;
+            auto out_ptr = out_features + out_idx_offset;
+            auto din_ptr = din_features + in_idx_offset;
+            auto dout_ptr = dout_features + out_idx_offset;
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                auto in = in_ptr[j];
+                auto out = out_ptr[j];
+                if (in == out){{
+                    din_ptr[j] = din_ptr[j] + dout_ptr[j];
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.static_function
+    def forward(self):
+        # Static function that launches forward_kernel<T> on the given
+        # stream, with T dispatched on out.dtype() over float, double,
+        # half and bfloat16.
+        # Launch shape: threads = (NumFeatures, 512 / NumFeatures) and
+        # blocks = (div_up(out.dim(1), NumFeatures), div_up(nhot, Num0)),
+        # where nhot = out_inds.dim(0). NumFeatures is picked from
+        # {512, 256, 128, 64, 32, 16} by dispatch_int_noexcept with the
+        # predicate out.dim(1) >= candidate; 16 is used when no candidate
+        # matches.
+        # NOTE(review): presumably the first matching candidate wins, as
+        # the comment inside the emitted code suggests -- confirm.
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("in", "tv::Tensor")
+        code.arg("out_inds", "tv::Tensor")
+        code.arg("in_inds", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+
+        code.raw(f"""
+        auto nhot = out_inds.dim(0);
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            constexpr int MaxThreads = 512;
+            tv::cuda::Launch launcher(1);
+            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
+                // if out.dim(1) > value in list above, run this function.
+                // if a value is found, other value won't be executed.
+                int NumFeatures = TV_DECLTYPE(V)::value;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }});
+            if (!found){{
+                int NumFeatures = 16;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }}
+            launcher(forward_kernel<T>, out.data_ptr<T>(), in.data_ptr<const T>(),
+                out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
+
+        }});
+        """)
+        return code
+
+    @pccm.cuda.static_function
+    def backward(self):
+        # Static function that launches backward_kernel<T> on the given
+        # stream, with T dispatched on out.dtype() over float, double,
+        # half and bfloat16. The launch shape is chosen exactly as in
+        # `forward`: threads = (NumFeatures, 512 / NumFeatures) with
+        # NumFeatures taken from {512, ..., 16} via dispatch_int_noexcept
+        # (predicate out.dim(1) >= candidate) and 16 as the fallback.
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("in", "tv::Tensor")
+        code.arg("dout", "tv::Tensor")
+        code.arg("din", "tv::Tensor")
+        code.arg("out_inds", "tv::Tensor")
+        code.arg("in_inds", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+
+        code.raw(f"""
+        auto nhot = out_inds.dim(0);
+
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            constexpr int MaxThreads = 512;
+            tv::cuda::Launch launcher(1);
+            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
+                // if out.dim(1) > value in list above, run this function.
+                // if a value is found, other value won't be executed.
+                int NumFeatures = TV_DECLTYPE(V)::value;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }});
+            if (!found){{
+                int NumFeatures = 16;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }}
+            launcher(backward_kernel<T>, out.data_ptr<const T>(), in.data_ptr<const T>(),
+                dout.data_ptr<const T>(), din.data_ptr<T>(),
+                out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
+        }});
+        """)
+        return code
--- a/spconv/csrc/sparse/pointops.py
+++ b/spconv/csrc/sparse/pointops.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from cumm.gemm.core.metaarray import MetaArray, seq
+from cumm import dtypes
+import pccm 
+from cumm.gemm.layout import TensorGeneric, to_stride
+from cumm.common import TensorView, TensorViewHashKernel
+from cumm.gemm import codeops
+from typing import List 
+from cumm.conv.params import ConvProblem
+import numpy as np 
+
+
+# pccm parameterized class holding the three CUDA global functions used to
+# voxelize points through a hash table: build_hash_table, assign_table and
+# generate_voxel.
+class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
+    """this class don't support multi-thread. 
+    create p2v for every thread.
+    """
+    def __init__(self, dtype: dtypes.DType, ndim: int, layout: TensorGeneric, zyx: bool = True):
+        # dtype: element type of the point / voxel buffers.
+        # ndim: number of spatial dimensions.
+        # layout: registered as param class "Layout" in namespace "layout_ns".
+        # zyx: when true, point coordinates are read in reversed order
+        #      (see build_hash_table).
+        super().__init__()
+        self.add_dependency(TensorView, TensorViewHashKernel)
+        self.add_param_class("layout_ns", layout, "Layout")
+        self.dtype = dtype 
+        self.ndim = ndim 
+        self.zyx = zyx
+
+    @pccm.cuda.cuda_global_function
+    def build_hash_table(self):
+        # CUDA global function templated on the hash table type. For each
+        # point i it computes, per dimension j,
+        #   c = floor((coordinate - coors_range[j]) / vsize[j])
+        # and accumulates prod = sum(grid_stride[j] * c). If every c lies
+        # in [0, grid_bound[j]), `prod` is stored in points_indice_data[i]
+        # and (prod, i) is inserted into `table`; otherwise
+        # points_indice_data[i] is set to -1 and nothing is inserted.
+        # Only the first `ndim` entries of `coors_range` are read here.
+        # NOTE(review): presumably those are the lower bounds of the
+        # range -- confirm against the constructor of the caller.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", "TTable")
+        code.arg("points", f"{self.dtype} const*")
+        code.arg("points_indice_data", f"int64_t *")
+
+        # number of values stored per point in `points`
+        code.arg("point_stride", f"int")
+        code.arg("vsize", f"tv::array<float, {self.ndim}>")
+        code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        code.arg("grid_bound", f"tv::array<int, {self.ndim}>")
+        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")
+
+        code.arg("num_points", f"int")
+        # index of the point component used for dimension j: reversed
+        # (ndim - 1 - j) when zyx is set, j otherwise
+        point_xyz = f"{self.ndim - 1} - j"
+        if not self.zyx:
+            point_xyz = f"j"
+        # if zyx, the coors_range and grid_bound is zyx too, 
+        # generated indices is zyx.
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(num_points)){{
+            bool failed = false;
+            int c;
+            int64_t prod = 0;
+        #pragma unroll
+            for (int j = 0; j < {self.ndim}; ++j) {{
+                c = floor((points[i * point_stride + {point_xyz}] - coors_range[j]) /
+                            vsize[j]);
+                if ((c < 0 || c >= grid_bound[j])) {{
+                    failed = true;
+                }}
+                prod += grid_stride[j] * c;
+            }}
+            if (!failed){{
+                points_indice_data[i] = prod;
+                table.insert(prod, i);
+            }}else{{
+                points_indice_data[i] = -1;
+            }}
+        }}
+        """)
+        return code 
+
+    @pccm.cuda.cuda_global_function
+    def assign_table(self):
+        # CUDA global function templated on the hash table type. It walks
+        # every slot of `table`; each non-empty slot gets its value
+        # replaced by a fresh number taken from the atomic counter
+        # `count`. When that number is below `max_voxels`, the slot key is
+        # decoded with layout.inverse into row `number` of `indices`
+        # (ndim ints per row).
+        # `count` keeps increasing past `max_voxels`; it is not clamped
+        # here.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", "TTable")
+        code.arg("indices", f"int*")
+        code.arg("count", f"int*")
+        code.arg("layout", f"Layout")
+        code.arg("max_voxels", f"int")
+
+        code.raw(f"""
+        auto data = table.data();
+        for (int i : tv::KernelLoopX<int>(table.size())){{
+            auto &item = data[i];
+            if (!item.empty()) {{
+                item.second = tv::cuda::atomicAggInc(count);
+                if (item.second < max_voxels){{
+                    layout.inverse(item.first, indices + item.second * {self.ndim});
+                }}
+            }}
+        }}
+        """)
+        return code 
+
+    @pccm.cuda.cuda_global_function
+    def generate_voxel(self):
+        # CUDA global function templated on the hash table type. For each
+        # point whose points_indice_data entry is not -1 it looks the key
+        # up in `table`; if the stored voxel number is below `max_voxels`
+        # it atomically increments num_per_voxel[voxel] and, while the
+        # previous count is below `max_points_per_voxel`, copies the
+        # point's `point_stride` values into
+        # voxels[voxel, previous_count, :].
+        # The increment is unconditional, so num_per_voxel may end up
+        # larger than `max_points_per_voxel`.
+        # `vsize`, `coors_range`, `grid_bound` and `grid_stride` are part
+        # of the signature but not referenced by the emitted body.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", "TTable")
+        code.arg("points", f"{self.dtype} const*")
+
+        code.arg("points_indice_data", f"const int64_t*")
+        code.arg("voxels", f"{self.dtype} *")
+        code.arg("num_per_voxel", f"int *")
+
+        code.arg("point_stride", f"int")
+        code.arg("max_points_per_voxel", f"int")
+        code.arg("max_voxels", f"int")
+
+        code.arg("vsize", f"tv::array<float, {self.ndim}>")
+        code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        code.arg("grid_bound", f"tv::array<int, {self.ndim}>")
+        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")
+
+        code.arg("num_points", f"int")
+        code.raw(f"""
+        int voxel_stride0 = point_stride * max_points_per_voxel;
+        for (int i : tv::KernelLoopX<int>(num_points)){{
+            int64_t prod = points_indice_data[i];
+            if (prod != -1){{
+                auto voxel_index_pair = table.lookup(prod);
+                if (!voxel_index_pair.empty() &&
+                    voxel_index_pair.second < max_voxels) {{
+                    int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1);
+                    if (old < max_points_per_voxel) {{
+                        for (int j = 0; j < point_stride; ++j) {{
+                            voxels[voxel_index_pair.second * voxel_stride0 + old * point_stride + j] = points[i * point_stride + j];
+                        }}
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code 
+
+class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
+    """pccm generator for a hash-table based CUDA point-to-voxel converter.
+
+    The generated C++ class owns pre-allocated output buffers (``voxels``,
+    ``indices``, ``num_per_voxel``) plus scratch buffers for the hash table
+    (``hashdata``) and the per-point linear voxel key (``point_indice_data``).
+    Kernels come from ``Point2VoxelKernel``, registered under the ``kernel``
+    namespace for ``point_to_voxel_hash`` only.
+
+    Args:
+        dtype: element type of the point/voxel features.
+        ndim: number of spatial dimensions of the voxel grid.
+        zyx: when true, the constructor stores voxel size and range with the
+            axis order reversed relative to its ``*_xyz`` arguments.
+    """
+    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
+        super().__init__()
+        self.add_dependency(TensorView)
+        layout = TensorGeneric(ndim, True)
+        self.add_param_class("layout_ns", layout, "Layout")
+        self.dtype = dtype 
+        self.ndim = ndim 
+        self.zyx = zyx
+        cuda_funcs = [self.point_to_voxel_hash]
+        self.add_impl_only_param_class(cuda_funcs, "kernel", Point2VoxelKernel(dtype, ndim, layout, zyx))
+
+        # Scratch buffers, exposed read-only to Python.
+        self.add_pybind_member("hashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+        self.add_pybind_member("point_indice_data", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+
+        # Output buffers, exposed read-only to Python.
+        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("num_per_voxel", "tv::Tensor", readwrite=False)
+        # Grid geometry, filled in by the generated constructor.
+        self.add_member("vsize", f"tv::array<float, {self.ndim}>")
+        self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
+        self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark_prop_getter(prop_name="grid_size")
+    @pccm.member_function
+    def get_grid_size(self):
+        """Emit a getter that copies ``grid_size`` into a ``std::array``.
+
+        Bound to Python as the ``grid_size`` property.
+        """
+        code = pccm.FunctionCode()
+        code.raw(f"""
+        std::array<int, {self.ndim}> res;
+        for (int i = 0; i < {self.ndim}; ++i){{
+            res[i] = grid_size[i];
+        }}
+        return res;
+        """)
+        return code.ret(f"std::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark
+    @pccm.constructor
+    def ctor(self):
+        """Emit the constructor.
+
+        Generated arguments: ``vsize_xyz`` (voxel size per axis),
+        ``coors_range_xyz`` (``ndim`` lower bounds followed by ``ndim`` upper
+        bounds), ``num_point_features``, ``max_num_voxels`` and
+        ``max_num_points_per_voxel``.
+
+        The generated body stores the geometry (axis-reversed when ``zyx``),
+        derives ``grid_size`` as ``round((upper - lower) / vsize)`` per axis,
+        computes ``grid_stride`` with the last axis having stride 1, and
+        allocates the output and scratch tensors.
+        """
+        code = pccm.FunctionCode()
+        code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
+        code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
+        code.arg("num_point_features", f"int")
+        code.arg("max_num_voxels, max_num_points_per_voxel", f"int")
+        if self.zyx:
+            # Reverse the axis order of both halves of the range and of vsize.
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[{self.ndim - 1} - i] = vsize_xyz[i];
+                coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
+                coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        else:
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[i] = vsize_xyz[i];
+                coors_range[i] = coors_range_xyz[i];
+                coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        # if zyx, grid_size is zyx.
+        # NOTE(review): the trailing 0 passed to tv::zeros is presumably the
+        # CUDA device index (the CPU sibling passes -1) -- confirm.
+        code.raw(f"""
+        int64_t prod = 1;
+        for (size_t i = 0; i < {self.ndim}; ++i) {{
+            grid_size[i] =
+                std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
+        }}
+        for (int i = {self.ndim} - 1; i >= 0; --i) {{
+            grid_stride[i] = prod;
+            prod *= grid_size[i];
+        }}
+        voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, 0);
+        indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, 0);
+        num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, 0);
+        hashdata = tv::zeros({{1}}, tv::custom128, 0);
+        point_indice_data = tv::zeros({{1}}, tv::int64, 0);
+        """)
+        return code 
+
+    @pccm.pybind.mark
+    @pccm.cuda.member_function
+    def point_to_voxel_hash(self):
+        """Emit ``point_to_voxel_hash(points, clear_voxels=true)``.
+
+        The generated function:
+
+        1. validates ``points`` (2D, at least ``ndim`` columns, and exactly as
+           many columns as the voxel buffer has features -- the
+           ``generate_voxel`` kernel strides the voxel buffer by
+           ``points.dim(1)``, so a mismatch would write out of bounds or
+           scramble the layout);
+        2. grows ``hashdata`` / ``point_indice_data`` when they are too small
+           for this call, zeroes ``num_per_voxel`` and clears the hash table;
+        3. launches ``build_hash_table``, ``assign_table`` and
+           ``generate_voxel``;
+        4. copies the voxel counter to the host and returns the first
+           ``count_val`` rows of ``voxels``, ``indices`` and ``num_per_voxel``.
+        """
+        code = pccm.FunctionCode()
+        code.arg("points", "tv::Tensor")
+        code.arg("clear_voxels", "bool", "true")
+
+        code.raw(f"""
+        TV_ASSERT_INVALID_ARG(points.ndim() == 2 && points.dim(1) >= {self.ndim},
+            "points must be a 2D tensor with at least {self.ndim} columns");
+        TV_ASSERT_RT_ERR(points.dim(1) == voxels.dim(2), "your points num features doesn't equal to voxel.");
+        using V = int64_t;
+        using KeyType = int64_t;
+        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
+        if (clear_voxels){{
+            voxels.zero_();
+        }}
+        using table_t =
+            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
+                                        kEmptyKey, false>;
+        using pair_t = typename table_t::value_type;
+        // int64_t expected_hash_data_num = int64_t(tv::hash::align_to_power2(points.dim(0) * 2));
+        int64_t expected_hash_data_num = points.dim(0) * 2;
+
+        if (hashdata.dim(0) < expected_hash_data_num){{
+            hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
+        }}
+        if (point_indice_data.dim(0) < points.dim(0)){{
+            point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
+        }}
+        // auto timer = tv::CudaContextTimer<>();
+        num_per_voxel.zero_();
+        table_t hash = table_t(hashdata.data_ptr<pair_t>(), expected_hash_data_num);
+        hash.clear();
+        // tv::ssprint("clear time", timer.report());
+        auto launcher = tv::cuda::Launch(points.dim(0));
+        launcher(kernel::build_hash_table<table_t>, hash, points.data_ptr<const {self.dtype}>(),
+                point_indice_data.data_ptr<int64_t>(),
+                points.dim(1), vsize, coors_range, grid_size, grid_stride, points.dim(0));
+        // tv::ssprint("build_hash_table", timer.report());
+
+        auto table_launcher = tv::cuda::Launch(hash.size());
+        tv::Tensor count = tv::zeros({{1}}, tv::int32, 0);
+        Layout layout = Layout::from_shape(grid_size);
+        table_launcher(kernel::assign_table<table_t>, hash, indices.data_ptr<int>(),
+                        count.data_ptr<int>(),
+                        layout, voxels.dim(0));
+        auto count_cpu = count.cpu();
+        int count_val = count_cpu.item<int32_t>();
+        // tv::ssprint("assign_table", timer.report());
+
+        launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
+                point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
+                num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1), 
+                voxels.dim(0), vsize, coors_range,
+                grid_size, grid_stride, points.dim(0));
+        // tv::ssprint("generate_voxel", timer.report());
+
+        return std::make_tuple(voxels.slice_first_axis(0, count_val), 
+            indices.slice_first_axis(0, count_val), 
+            num_per_voxel.slice_first_axis(0, count_val));
+
+        """)
+        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
+
+
+
+class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
+    """pccm generator for a host-side point-to-voxel converter.
+
+    Instead of a hash table, the generated C++ class keeps ``densehashdata``,
+    an int32 tensor with one entry per grid cell that maps a cell to the index
+    of its voxel (``-1`` meaning "no voxel yet"). Output buffers ``voxels``,
+    ``indices`` and ``num_per_voxel`` are pre-allocated in the constructor;
+    ``mean_per_voxel`` holds per-voxel feature means for the mean-padding
+    variant.
+
+    Args:
+        dtype: element type of the voxel features.
+        ndim: number of spatial dimensions of the voxel grid.
+        zyx: when true, geometry is stored with the axis order reversed
+            relative to the constructor's ``*_xyz`` arguments.
+    """
+    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
+        super().__init__()
+        self.add_dependency(TensorView)
+        layout = TensorGeneric(ndim, True)
+        self.add_param_class("layout_ns", layout, "Layout")
+        self.dtype = dtype 
+        self.ndim = ndim 
+        self.zyx = zyx
+
+        # Dense cell -> voxel-index lookup grid, exposed read-only to Python.
+        self.add_pybind_member("densehashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+
+        # Output buffers, exposed read-only to Python.
+        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("num_per_voxel", "tv::Tensor", readwrite=False)
+        # Running per-voxel feature mean; not exposed to Python.
+        self.add_member("mean_per_voxel", "tv::Tensor")
+
+        # Grid geometry, filled in by the generated constructor.
+        self.add_member("vsize", f"tv::array<float, {self.ndim}>")
+        self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
+        self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark_prop_getter(prop_name="grid_size")
+    @pccm.member_function
+    def get_grid_size(self):
+        """Emit a getter that copies ``grid_size`` into a ``std::array``.
+
+        Bound to Python as the ``grid_size`` property.
+        """
+        code = pccm.FunctionCode()
+        code.raw(f"""
+        std::array<int, {self.ndim}> res;
+        for (int i = 0; i < {self.ndim}; ++i){{
+            res[i] = grid_size[i];
+        }}
+        return res;
+        """)
+        return code.ret(f"std::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark
+    @pccm.constructor
+    def ctor(self):
+        """Emit the constructor.
+
+        Generated arguments: ``vsize_xyz``, ``coors_range_xyz`` (``ndim`` lower
+        bounds followed by ``ndim`` upper bounds), ``num_point_features``,
+        ``max_num_voxels`` and ``max_num_points_per_voxel``.
+
+        The generated body stores the geometry (axis-reversed when ``zyx``),
+        derives ``grid_size`` and ``grid_stride``, allocates every buffer and
+        fills ``densehashdata`` with ``-1``.
+        """
+        code = pccm.FunctionCode()
+        code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
+        code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
+        code.arg("num_point_features", f"int")
+        code.arg("max_num_voxels, max_num_points_per_voxel", f"int")
+        if self.zyx:
+            # Reverse the axis order of both halves of the range and of vsize.
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[{self.ndim - 1} - i] = vsize_xyz[i];
+                coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
+                coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        else:
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[i] = vsize_xyz[i];
+                coors_range[i] = coors_range_xyz[i];
+                coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        # grid_size = round((upper - lower) / vsize) per axis; grid_stride has
+        # stride 1 on the last axis.
+        # NOTE(review): the trailing -1 passed to tv::zeros is presumably the
+        # "CPU" device index (the CUDA sibling passes 0) -- confirm.
+        code.raw(f"""
+        int64_t prod = 1;
+        for (size_t i = 0; i < {self.ndim}; ++i) {{
+            grid_size[i] =
+                std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
+        }}
+        for (int i = {self.ndim} - 1; i >= 0; --i) {{
+            grid_stride[i] = prod;
+            prod *= grid_size[i];
+        }}
+        voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, -1);
+        indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, -1);
+        num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, -1);
+        mean_per_voxel = tv::zeros({{max_num_voxels, num_point_features}}, tv::DType({self.dtype.tv_dtype}), -1);
+        tv::TensorShape grid_shape(grid_size.data(), grid_size.data() + {self.ndim});
+        densehashdata = tv::zeros(grid_shape, tv::int32, -1);
+        auto densehashdata_ptr = densehashdata.data_ptr<int>();
+        for (int i= 0; i < densehashdata.size(); ++i){{
+            densehashdata_ptr[i] = -1;
+        }}
+        """)
+        return code 
+
+    def point_to_voxel_template(self, mean: bool = False):
+        """Build the shared body of the two point-to-voxel member functions.
+
+        The generated function takes ``points`` and ``clear_voxels`` (default
+        ``true``) and, for each point in order:
+
+        * computes its integer cell per axis with ``floor`` and skips the
+          point if any axis falls outside ``grid_size``;
+        * looks the cell up in ``densehashdata``; if unassigned, claims the
+          next voxel index (or skips the point once ``max_num_voxels`` voxels
+          exist) and records the cell coordinates in ``indices``;
+        * if the voxel still has room, copies the point's features into it
+          and increments its count; when ``mean`` is true it also updates the
+          running mean in ``mean_per_voxel``.
+
+        Afterwards every touched ``densehashdata`` entry is reset to ``-1``
+        so the grid can be reused, and when ``mean`` is true the unused point
+        slots of each voxel are filled with that voxel's mean.
+
+        Args:
+            mean: enable the running-mean bookkeeping and mean padding.
+
+        Returns:
+            The ``pccm.FunctionCode`` whose generated function returns the
+            first ``res_voxel_num`` rows of ``voxels``, ``indices`` and
+            ``num_per_voxel``.
+        """
+        code = pccm.FunctionCode()
+        code.arg("points", "tv::Tensor")
+        code.arg("clear_voxels", "bool", "true")
+
+        # Column of the point that feeds grid axis ``j``: reversed when the
+        # grid is stored zyx, identity otherwise.
+        point_xyz = f"{self.ndim - 1} - j"
+        if not self.zyx:
+            point_xyz = f"j"
+        code.raw(f"""
+        auto max_num_voxels = voxels.dim(0);
+        auto max_num_points_per_voxel = voxels.dim(1);
+        num_per_voxel.zero_();
+        if (clear_voxels){{
+            voxels.zero_();
+        }}
+        """)
+        # Both branches declare ``means_rw`` (the generated body references it
+        # in either mode); only the mean variant zeroes the buffer first.
+        if mean:
+            code.raw(f"mean_per_voxel.zero_();")
+            code.raw(f"auto means_rw = mean_per_voxel.tview<{self.dtype}, 2>();")
+        else:
+            code.raw(f"auto means_rw = mean_per_voxel.tview<{self.dtype}, 2>();")
+        
+        # NOTE(review): ``codeops.unpack`` presumably expands to a
+        # comma-separated index list (coor[0], coor[1], ...) and
+        # ``pccm.boolean`` to a C++ true/false literal -- confirm.
+        code.raw(f"""
+        int res_voxel_num = 0;
+        int num_features = points.dim(1);
+        auto N = points.dim(0);
+        int c;
+        TV_ASSERT_RT_ERR(num_features == voxels.dim(2), "your points num features doesn't equal to voxel.");
+        tv::dispatch<float, double>(points.dtype(), [&](auto I){{
+            using T = decltype(I);
+            auto points_rw = points.tview<T, 2>();
+            auto coors_rw = indices.tview<int, 2>();
+            auto voxels_rw = voxels.tview<{self.dtype}, 3>();
+            auto num_points_per_voxel_rw = num_per_voxel.tview<int, 1>();
+            
+            int coor[{self.ndim}];
+            auto coor_to_voxelidx_rw = densehashdata.tview<int, {self.ndim}>();
+            int voxelidx, num;
+            bool failed;
+            int voxel_num = 0;
+            for (int i = 0; i < N; ++i) {{
+                failed = false;
+                for (int j = 0; j < {self.ndim}; ++j) {{
+                    c = floor((points_rw(i, {point_xyz}) - coors_range[j]) / vsize[j]);
+                    if ((c < 0 || c >= grid_size[j])) {{
+                        failed = true;
+                        break;
+                    }}
+                    coor[j] = c;
+                }}
+                if (failed)
+                    continue;
+                voxelidx = coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))});
+
+                if (voxelidx == -1) {{
+                    voxelidx = voxel_num;
+                    if (voxel_num >= max_num_voxels)
+                        continue;
+                    voxel_num += 1;
+                    coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))}) = voxelidx;
+                    for (int k = 0; k < {self.ndim}; ++k) {{
+                        coors_rw(voxelidx, k) = coor[k];
+                    }}
+                }}
+                num = num_points_per_voxel_rw(voxelidx);
+                if (num < max_num_points_per_voxel) {{
+                    // voxel_point_mask_rw(voxelidx, num) = {self.dtype}(1);
+                    for (int k = 0; k < num_features; ++k) {{
+                        voxels_rw(voxelidx, num, k) = points_rw(i, k);
+                    }}
+                    num_points_per_voxel_rw(voxelidx) += 1;
+                    if TV_IF_CONSTEXPR ({pccm.boolean(mean)}){{
+                        for (int k = 0; k < num_features; ++k) {{
+                            means_rw(voxelidx, k) +=
+                                (points_rw(i, k) - means_rw(voxelidx, k)) / {self.dtype}(num + 1);
+                        }}
+                    }}
+                }}
+            }}
+            for (int i = 0; i < voxel_num; ++i) {{
+                coor_to_voxelidx_rw({codeops.unpack("coors_rw", range(self.ndim), left="(i, ", right=")")}) = -1;
+                if TV_IF_CONSTEXPR ({pccm.boolean(mean)}){{
+                    num = num_points_per_voxel_rw(i);
+                    for (int j = num; j < max_num_points_per_voxel; ++j) {{
+                        for (int k = 0; k < num_features; ++k) {{
+                            voxels_rw(i, j, k) = means_rw(i, k);
+                        }}
+                    }}
+                }}
+            }}
+            res_voxel_num = voxel_num;
+        }});
+        return std::make_tuple(voxels.slice_first_axis(0, res_voxel_num), 
+            indices.slice_first_axis(0, res_voxel_num), 
+            num_per_voxel.slice_first_axis(0, res_voxel_num));
+        """)
+        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
+
+
+    @pccm.pybind.mark
+    @pccm.member_function
+    def point_to_voxel(self):
+        """Emit ``point_to_voxel``: the template without mean handling."""
+        return self.point_to_voxel_template(False)
+
+    @pccm.pybind.mark
+    @pccm.member_function
+    def point_to_voxel_empty_mean(self):
+        """Emit ``point_to_voxel_empty_mean``: the template with mean padding."""
+        return self.point_to_voxel_template(True)
--- a/spconv/ops.py
+++ b/spconv/ops.py
-# Copyright 2019 Yan Yan
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from enum import Enum
-
-import torch
-
-import spconv
-
-
-class ConvAlgo(Enum):
-    Native = 0  # small memory cost, faster when number of points is large.
-    Batch = 1  # high memory cost, faster when number of points is small (< 50000)
-    BatchGemmGather = 2  # high memory cost, faster when number of points medium
-
-
-def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
-    ndim = len(input_size)
-    output_size = []
-    for i in range(ndim):
-        size = (input_size[i] + 2 * padding[i] - dilation[i] *
-                (kernel_size[i] - 1) - 1) // stride[i] + 1
-        if kernel_size[i] == -1:
-            output_size.append(1)
-        else:
-            output_size.append(size)
-    return output_size
-
-
-def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
-                           output_padding):
-    ndim = len(input_size)
-    output_size = []
-    for i in range(ndim):
-        if kernel_size[i] == -1:
-            raise ValueError("deconv don't support kernel_size < 0")
-        size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
-            i] + output_padding[i]
-        output_size.append(size)
-    return output_size
-
-
-def get_indice_pairs(indices,
-                     batch_size,
-                     spatial_shape,
-                     ksize=3,
-                     stride=1,
-                     padding=0,
-                     dilation=1,
-                     out_padding=0,
-                     subm=False,
-                     transpose=False,
-                     grid=None,
-                     use_hash=False):
-    ndim = indices.shape[1] - 1
-    if not isinstance(ksize, (list, tuple)):
-        ksize = [ksize] * ndim
-    if not isinstance(stride, (list, tuple)):
-        stride = [stride] * ndim
-    if not isinstance(padding, (list, tuple)):
-        padding = [padding] * ndim
-    if not isinstance(dilation, (list, tuple)):
-        dilation = [dilation] * ndim
-    if not isinstance(out_padding, (list, tuple)):
-        out_padding = [out_padding] * ndim
-
-    for d, s in zip(dilation, stride):
-        assert any([s == 1, d == 1]), "don't support this."
-
-    if not subm:
-        if transpose:
-            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
-                                               padding, dilation, out_padding)
-        else:
-            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
-                                             padding, dilation)
-    else:
-        out_shape = spatial_shape
-    if grid is None:
-        res = torch.ops.spconv.get_indice_pairs(indices, batch_size, out_shape,
-                                                spatial_shape, ksize, stride,
-                                                padding, dilation, out_padding,
-                                                int(subm), int(transpose),
-                                                int(use_hash))
-        return res
-    else:
-        if ndim == 2:
-            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
-        elif ndim == 3:
-            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_3d
-        else:
-            raise NotImplementedError
-        return get_indice_pairs_func(indices, grid, batch_size, out_shape,
-                                     spatial_shape, ksize, stride, padding,
-                                     dilation, out_padding, int(subm),
-                                     int(transpose), int(use_hash))
-
-
-def indice_conv(features,
-                filters,
-                indice_pairs,
-                indice_pair_num,
-                num_activate_out,
-                inverse=False,
-                subm=False,
-                algo=ConvAlgo.Native.value):
-    return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
-                                        indice_pair_num, num_activate_out,
-                                        int(inverse), int(subm), algo)
-
-
-def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
-                      num_activate_out, inverse, subm):
-    return torch.ops.spconv.fused_indice_conv_bn(features, filters, bias,
-                                                 indice_pairs, indice_pair_num,
-                                                 num_activate_out,
-                                                 int(inverse), int(subm))
-
-
-def indice_conv_backward(features,
-                         filters,
-                         out_bp,
-                         indice_pairs,
-                         indice_pair_num,
-                         inverse=False,
-                         subm=False,
-                         algo=ConvAlgo.Native.value):
-    return torch.ops.spconv.indice_conv_backward(features, filters, out_bp,
-                                                 indice_pairs, indice_pair_num,
-                                                 int(inverse), int(subm), algo)
-
-
-def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
-    return torch.ops.spconv.indice_maxpool(features, indice_pairs,
-                                           indice_pair_num, num_activate_out)
-
-
-def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
-                            indice_pair_num):
-    return torch.ops.spconv.indice_maxpool_backward(features, out_features,
-                                                    out_bp, indice_pairs,
-                                                    indice_pair_num)
-
-
-def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
-    res = torch.ops.spconv.nms(boxes, scores, pre_max_size, post_max_size,
-                               thresh, eps)
-    return res
-
-
-def pillar_scatter(features, coors, shape):
-    if features.dtype == torch.float32:
-        return torch.ops.spconv.pillar_scatter_float(features, coors, shape)
-    elif features.dtype == torch.half:
-        return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
-    else:
-        raise NotImplementedError
--- a/spconv/pytorch/__init__.py
+++ b/spconv/pytorch/__init__.py
+import platform
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from spconv.pytorch import ops
+from spconv.pytorch.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
+                         SparseConvTranspose3d, SparseInverseConv2d,
+                         SparseInverseConv3d, SubMConv2d, SubMConv3d)
+from spconv.pytorch.core import SparseConvTensor
+from spconv.pytorch.identity import Identity
+from spconv.pytorch.modules import SparseModule, SparseSequential
+from spconv.pytorch.ops import ConvAlgo
+from spconv.pytorch.pool import SparseMaxPool2d, SparseMaxPool3d
+from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable
+
+
+class ToDense(SparseModule):
+    """Sparse module that densifies its input.
+
+    ``forward`` returns the result of ``x.dense()``; per the original
+    author's note this is an NCHW dense tensor.
+    """
+
+    def forward(self, x: SparseConvTensor):
+        dense_output = x.dense()
+        return dense_output
+
+
+class RemoveGrid(SparseModule):
+    """Sparse module that drops the pre-allocated grid buffer.
+
+    ``forward`` sets ``grid`` on the incoming tensor to ``None`` in place and
+    returns that same tensor object.
+    """
+
+    def forward(self, x: SparseConvTensor):
+        sparse_tensor = x
+        sparse_tensor.grid = None
+        return sparse_tensor
--- a/spconv/conv.py
+++ b/spconv/conv.py
-# Copyright 2019 Yan Yan
+# Copyright 2021 Yan Yan
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@

 import math
 import time
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import torch
@@ -21,11 +22,13 @@ from torch import nn
 from torch.nn import init
 from torch.nn.parameter import Parameter

-import spconv
-import spconv.functional as Fsp
-from spconv import ops
-from spconv.modules import SparseModule
-
+from spconv import pytorch as spconv
+from spconv.algo import ConvAlgo
+import spconv.pytorch.functional as Fsp
+from spconv.pytorch import ops
+from spconv.pytorch.core import IndiceData, SparseConvTensor
+from spconv.pytorch.modules import SparseModule
+from spconv.constants import FILTER_HWIO

 def _calculate_fan_in_and_fan_out_hwio(tensor):
    dimensions = tensor.ndimension()
@@ -38,8 +41,12 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
        fan_in = tensor.size(-2)
        fan_out = tensor.size(-1)
    else:
+        if FILTER_HWIO:
            num_input_fmaps = tensor.size(-2)
            num_output_fmaps = tensor.size(-1)
+        else:
+            num_input_fmaps = tensor.size(-1)
+            num_output_fmaps = tensor.size(-2)
        receptive_field_size = 1
        if tensor.dim() > 2:
            receptive_field_size = tensor[..., 0, 0].numel()
@@ -56,24 +63,24 @@ class SparseConvolution(SparseModule):
    ]

    def __init__(self,
-                 ndim,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 subm=False,
-                 output_padding=0,
-                 transposed=False,
-                 inverse=False,
-                 indice_key=None,
-                 fused_bn=False,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
-        super(SparseConvolution, self).__init__()
+                 ndim: int,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, List[int], Tuple[int, ...]]=3,
+                 stride: Union[int, List[int], Tuple[int, ...]]=1,
+                 padding: Union[int, List[int], Tuple[int, ...]]=0,
+                 dilation: Union[int, List[int], Tuple[int, ...]]=1,
+                 groups: Union[int, List[int], Tuple[int, ...]]=1,
+                 bias: bool=True,
+                 subm: bool=False,
+                 output_padding: Union[int, List[int], Tuple[int, ...]]=0,
+                 transposed: bool=False,
+                 inverse: bool=False,
+                 indice_key: Optional[str]=None,
+                 fused_bn: bool=False,
+                 algo: ops.ConvAlgo=ops.ConvAlgo.Native,
+                 name=None):
+        super(SparseConvolution, self).__init__(name=name)
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
@@ -104,11 +111,13 @@ class SparseConvolution(SparseModule):
        self.subm = subm
        self.indice_key = indice_key
        self.fused_bn = fused_bn
-        self.use_hash = use_hash
-        self.algo = algo.value
-
+        self.algo = algo
+        if FILTER_HWIO:
            self.weight = Parameter(
                torch.Tensor(*kernel_size, in_channels, out_channels))
+        else:
+            self.weight = Parameter(
+                torch.Tensor(*kernel_size, out_channels, in_channels))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
@@ -117,14 +126,15 @@ class SparseConvolution(SparseModule):

    def reset_parameters(self):
        n = self.in_channels
-        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        # init.uniform_(self.weight, 0, 0.001)
+        init.kaiming_uniform_(self.weight, a=math.sqrt(0.005))
        if self.bias is not None:
            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

-    def forward(self, input):
-        assert isinstance(input, spconv.SparseConvTensor)
+    def forward(self, input: SparseConvTensor):
+        assert isinstance(input, SparseConvTensor)
        features = input.features
        device = features.device
        indices = input.indices
@@ -143,47 +153,91 @@ class SparseConvolution(SparseModule):
            out_spatial_shape = spatial_shape
        # input.update_grid(out_spatial_shape)
        # t = time.time()
+        out_tensor = input.shadow_copy()
+        if input.benchmark:
+            if self.name is None:
+                raise ValueError(
+                    "you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
+                )
+            if self.name not in input.benchmark_record:
+                input.benchmark_record[self.name] = {
+                    "type": "SparseConvolution",
+                    "indice_gen_time": [],
+                    "time": [],
+                    "num_points": [],
+                    "num_out_points": [],
+                    "params": {
+                        "kernel_size": self.kernel_size,
+                        "stride": self.stride,
+                        "padding": self.padding,
+                        "dilation": self.dilation,
+                        "output_padding": self.output_padding,
+                        "subm": self.subm,
+                        "transposed": self.transposed,
+                        "input_channels": self.in_channels,
+                        "out_channels": self.out_channels,
+                    }
+                }
        if self.conv1x1:
+            if FILTER_HWIO:
                features = torch.mm(
                    input.features,
-                self.weight.view(self.in_channels, self.out_channels))
+                    self.weight.view(self.out_channels, self.in_channels).T)
+            else:
+                features = torch.mm(
+                    input.features,
+                    self.weight.view(self.in_channels, self.out_channels).T)
+
            if self.bias is not None:
                features += self.bias
-            out_tensor = spconv.SparseConvTensor(features, input.indices,
-                                                 input.spatial_shape,
-                                                 input.batch_size)
-            out_tensor.indice_dict = input.indice_dict
-            out_tensor.grid = input.grid
+            out_tensor.features = features
            return out_tensor
        datas = input.find_indice_pair(self.indice_key)
        if self.inverse:
            assert datas is not None and self.indice_key is not None
-            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
+            outids = datas.indices
+            indice_pairs = datas.indice_pairs
+            indice_pair_num = datas.indice_pair_num
+            out_spatial_shape = datas.out_spatial_shape
            assert indice_pair_num.shape[0] == np.prod(
                self.kernel_size
            ), "inverse conv must have same kernel size as its couple conv"
        else:
            if self.indice_key is not None and datas is not None:
-                outids, _, indice_pairs, indice_pair_num, _ = datas
+                outids = datas.out_indices
+                indice_pairs = datas.indice_pairs
+                indice_pair_num = datas.indice_pair_num
            else:
+                if input.benchmark:
+                    torch.cuda.synchronize()
+                    t = time.time()
                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                    indices,
                    batch_size,
                    spatial_shape,
+                    self.algo,
                    self.kernel_size,
                    self.stride,
                    self.padding,
                    self.dilation,
                    self.output_padding,
                    self.subm,
-                    self.transposed,
-                    grid=input.grid,
-                    use_hash=self.use_hash)
-                input.indice_dict[self.indice_key] = (outids, indices,
-                                                      indice_pairs,
-                                                      indice_pair_num,
-                                                      spatial_shape)
+                    self.transposed)
+                if input.benchmark:
+                    torch.cuda.synchronize()
+                    interval = time.time() - t
+                    out_tensor.benchmark_record[
+                        self.name]["indice_gen_time"].append(interval)
+
+                indice_data = IndiceData(outids, indices, indice_pairs,
+                                         indice_pair_num, spatial_shape)
+                input.indice_dict[self.indice_key] = indice_data
+        if input.benchmark:
+            torch.cuda.synchronize()
+            t = time.time()
+
        if self.fused_bn:
+            raise NotImplementedError
            assert self.bias is not None
            out_features = ops.fused_indice_conv(features, self.weight,
                                                 self.bias,
@@ -210,12 +264,46 @@ class SparseConvolution(SparseModule):

            if self.bias is not None:
                out_features += self.bias
-        out_tensor = spconv.SparseConvTensor(out_features, outids,
-                                             out_spatial_shape, batch_size)
-        out_tensor.indice_dict = input.indice_dict
-        out_tensor.grid = input.grid
+        if input.benchmark:
+            torch.cuda.synchronize()
+            interval = time.time() - t
+            out_tensor.benchmark_record[self.name]["time"].append(interval)
+            out_tensor.benchmark_record[self.name]["num_points"].append(
+                features.shape[0])
+            out_tensor.benchmark_record[self.name]["num_out_points"].append(
+                out_features.shape[0])
+
+        out_tensor.features = out_features
+        out_tensor.indices = outids
+        out_tensor.spatial_shape = out_spatial_shape
        return out_tensor

+class SparseConv1d(SparseConvolution):
+    """One-dimensional variant of ``SparseConvolution``.
+
+    Every constructor argument is forwarded to the base class unchanged,
+    with ``1`` passed as the leading (dimensionality) argument.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super().__init__(
+            1,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            indice_key=indice_key,
+            algo=algo,
+            name=name,
+        )
+

 class SparseConv2d(SparseConvolution):
    def __init__(self,
@@ -228,8 +316,8 @@ class SparseConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConv2d, self).__init__(2,
                                           in_channels,
                                           out_channels,
@@ -240,8 +328,8 @@ class SparseConv2d(SparseConvolution):
                                           groups,
                                           bias,
                                           indice_key=indice_key,
-                                           use_hash=use_hash,
-                                           algo=algo)
+                                           algo=algo,
+                                           name=name)


 class SparseConv3d(SparseConvolution):
@@ -255,8 +343,8 @@ class SparseConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConv3d, self).__init__(3,
                                           in_channels,
                                           out_channels,
@@ -267,8 +355,8 @@ class SparseConv3d(SparseConvolution):
                                           groups,
                                           bias,
                                           indice_key=indice_key,
-                                           use_hash=use_hash,
-                                           algo=algo)
+                                           algo=algo,
+                                           name=name)


 class SparseConv4d(SparseConvolution):
@@ -282,8 +370,8 @@ class SparseConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConv4d, self).__init__(4,
                                           in_channels,
                                           out_channels,
@@ -294,8 +382,36 @@ class SparseConv4d(SparseConvolution):
                                           groups,
                                           bias,
                                           indice_key=indice_key,
-                                           use_hash=use_hash,
-                                           algo=algo)
+                                           algo=algo,
+                                           name=name)
+
+
+class SparseConvTranspose1d(SparseConvolution):
+    """One-dimensional ``SparseConvolution`` built with ``transposed=True``.
+
+    Every constructor argument is forwarded to the base class unchanged,
+    with ``1`` passed as the leading (dimensionality) argument.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super().__init__(
+            1,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            transposed=True,
+            indice_key=indice_key,
+            algo=algo,
+            name=name,
+        )


 class SparseConvTranspose2d(SparseConvolution):
@@ -309,8 +425,8 @@ class SparseConvTranspose2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConvTranspose2d, self).__init__(2,
                                                    in_channels,
                                                    out_channels,
@@ -322,8 +438,8 @@ class SparseConvTranspose2d(SparseConvolution):
                                                    bias,
                                                    transposed=True,
                                                    indice_key=indice_key,
-                                                    use_hash=use_hash,
-                                                    algo=algo)
+                                                    algo=algo,
+                                                    name=name)


 class SparseConvTranspose3d(SparseConvolution):
@@ -337,8 +453,8 @@ class SparseConvTranspose3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseConvTranspose3d, self).__init__(3,
                                                    in_channels,
                                                    out_channels,
@@ -350,8 +466,55 @@ class SparseConvTranspose3d(SparseConvolution):
                                                    bias,
                                                    transposed=True,
                                                    indice_key=indice_key,
-                                                    use_hash=use_hash,
-                                                    algo=algo)
+                                                    algo=algo,
+                                                    name=name)
+
+class SparseConvTranspose4d(SparseConvolution):
+    """Four-dimensional ``SparseConvolution`` built with ``transposed=True``.
+
+    Every constructor argument is forwarded to the base class unchanged,
+    with ``4`` passed as the leading (dimensionality) argument.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super().__init__(
+            4,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            transposed=True,
+            indice_key=indice_key,
+            algo=algo,
+            name=name,
+        )
+
+
+class SparseInverseConv1d(SparseConvolution):
+    """One-dimensional ``SparseConvolution`` built with ``inverse=True``.
+
+    ``indice_key`` is a required argument here; the remaining arguments are
+    forwarded to the base class unchanged, with ``1`` passed as the leading
+    (dimensionality) argument.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 indice_key,
+                 bias=True,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super().__init__(
+            1,
+            in_channels,
+            out_channels,
+            kernel_size,
+            bias=bias,
+            inverse=True,
+            indice_key=indice_key,
+            algo=algo,
+            name=name,
+        )


 class SparseInverseConv2d(SparseConvolution):
@@ -361,7 +524,8 @@ class SparseInverseConv2d(SparseConvolution):
                 kernel_size,
                 indice_key,
                 bias=True,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseInverseConv2d, self).__init__(2,
                                                  in_channels,
                                                  out_channels,
@@ -369,7 +533,8 @@ class SparseInverseConv2d(SparseConvolution):
                                                  bias=bias,
                                                  inverse=True,
                                                  indice_key=indice_key,
-                                                  algo=algo)
+                                                  algo=algo,
+                                                  name=name)


 class SparseInverseConv3d(SparseConvolution):
@@ -379,7 +544,8 @@ class SparseInverseConv3d(SparseConvolution):
                 kernel_size,
                 indice_key,
                 bias=True,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SparseInverseConv3d, self).__init__(3,
                                                  in_channels,
                                                  out_channels,
@@ -387,7 +553,54 @@ class SparseInverseConv3d(SparseConvolution):
                                                  bias=bias,
                                                  inverse=True,
                                                  indice_key=indice_key,
-                                                  algo=algo)
+                                                  algo=algo,
+                                                  name=name)
+
+class SparseInverseConv4d(SparseConvolution):
+    """Four-dimensional ``SparseConvolution`` built with ``inverse=True``.
+
+    ``indice_key`` is a required argument here; the remaining arguments are
+    forwarded to the base class unchanged, with ``4`` passed as the leading
+    (dimensionality) argument.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 indice_key,
+                 bias=True,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super().__init__(
+            4,
+            in_channels,
+            out_channels,
+            kernel_size,
+            bias=bias,
+            inverse=True,
+            indice_key=indice_key,
+            algo=algo,
+            name=name,
+        )
+
+class SubMConv1d(SparseConvolution):
+    """One-dimensional submanifold variant of ``SparseConvolution``.
+
+    Every constructor argument is forwarded to the base class unchanged,
+    with ``1`` passed as the leading (dimensionality) argument and ``True``
+    in the positional slot after ``bias`` (presumably the ``subm`` flag;
+    confirm against ``SparseConvolution.__init__``).
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super().__init__(
+            1,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            True,
+            indice_key=indice_key,
+            algo=algo,
+            name=name,
+        )


 class SubMConv2d(SparseConvolution):
@@ -401,8 +614,8 @@ class SubMConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SubMConv2d, self).__init__(2,
                                         in_channels,
                                         out_channels,
@@ -414,8 +627,8 @@ class SubMConv2d(SparseConvolution):
                                         bias,
                                         True,
                                         indice_key=indice_key,
-                                         use_hash=use_hash,
-                                         algo=algo)
+                                         algo=algo,
+                                         name=name)


 class SubMConv3d(SparseConvolution):
@@ -429,8 +642,8 @@ class SubMConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SubMConv3d, self).__init__(3,
                                         in_channels,
                                         out_channels,
@@ -442,8 +655,8 @@ class SubMConv3d(SparseConvolution):
                                         bias,
                                         True,
                                         indice_key=indice_key,
-                                         use_hash=use_hash,
-                                         algo=algo)
+                                         algo=algo,
+                                         name=name)


 class SubMConv4d(SparseConvolution):
@@ -457,8 +670,8 @@ class SubMConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
-                 algo=ops.ConvAlgo.Native):
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
        super(SubMConv4d, self).__init__(4,
                                         in_channels,
                                         out_channels,
@@ -470,5 +683,5 @@ class SubMConv4d(SparseConvolution):
                                         bias,
                                         True,
                                         indice_key=indice_key,
-                                         use_hash=use_hash,
-                                         algo=algo)
+                                         algo=algo,
+                                         name=name)
--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+# Compare the numeric (major, minor) pair rather than the raw version string:
+# a lexicographic string comparison ranks "1.10.0" below "1.8.0".
+_TORCH_MAJOR_MINOR = tuple(
+    int(part) for part in torch.__version__.split(".")[:2])
+
+if _TORCH_MAJOR_MINOR >= (1, 8):
+    # torch.fx is usable: expose the fx-traceable implementation.
+    from .core_fx import *
+else:
+    # Fall back to the implementation without torch.fx. Importing ".core"
+    # here would make this module import itself and expose nothing.
+    from .core_nofx import *
--- a/spconv/pytorch/core_fx.py
+++ b/spconv/pytorch/core_fx.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.fx
+from torch.fx.symbolic_trace import ProxyableClassMeta
+
+class IndiceData(object):
+    """Record of index data, as stored in ``SparseConvTensor.indice_dict``.
+
+    The five constructor arguments are kept unchanged as attributes of the
+    same name.
+    """
+    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
+                 out_spatial_shape):
+        self.out_indices, self.indices = out_indices, indices
+        self.indice_pairs, self.indice_pair_num = (indice_pairs,
+                                                   indice_pair_num)
+        self.out_spatial_shape = out_spatial_shape
+
+
+def scatter_nd(indices, updates, shape):
+    """PyTorch edition of TensorFlow's ``scatter_nd``.
+
+    Creates a zero tensor of ``shape`` and writes ``updates`` at the
+    positions given by the last axis of ``indices``. This function performs
+    no input validation, so use it carefully. When an index repeats, the
+    values are not accumulated (TensorFlow adds them).
+
+    Args:
+        indices: integer tensor whose last axis holds ``ndim`` coordinates
+            into the leading ``ndim`` axes of the result.
+        updates: values to write; viewed as
+            ``indices.shape[:-1] + shape[ndim:]`` before assignment.
+        shape: list or tuple with the shape of the result.
+
+    Returns:
+        Tensor of ``shape`` with the dtype and device of ``updates``.
+    """
+    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
+    ndim = indices.shape[-1]
+    # list(...) on both sides so a tuple ``shape`` concatenates as well.
+    output_shape = list(indices.shape[:-1]) + list(shape[ndim:])
+    flatted_indices = indices.view(-1, ndim)
+    # Index with a tuple: a plain list of index tensors relies on legacy
+    # sequence-as-tuple handling of multidimensional indices.
+    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis, )
+    ret[slices] = updates.view(*output_shape)
+    return ret
+
+
+class SparseConvTensor(metaclass=ProxyableClassMeta):
+    """Sparse tensor: per-point features plus the indices locating them.
+
+    ``features`` is read-only; use ``replace_feature`` to obtain a tensor
+    with new features (required by torch.fx).
+    """
+    def __init__(self,
+                 features,
+                 indices,
+                 spatial_shape,
+                 batch_size,
+                 grid=None,
+                 voxel_num=None,
+                 benchmark=False):
+        """
+        Args:
+            features: [num_points, num_features] feature tensor
+            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
+            spatial_shape: spatial shape of your sparse data
+            batch_size: batch size of your sparse data
+            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
+                is very large.
+            voxel_num: stored unchanged on the tensor (used for tensorrt).
+            benchmark: whether to enable benchmark. if enabled, all sparse operators will be
+                recorded to SparseConvTensor.
+        """
+        self._features = features
+        self.indices = indices
+        self.spatial_shape = spatial_shape
+        self.batch_size = batch_size
+        self.indice_dict = {}
+        if grid is None:
+            grid = torch.Tensor()  # empty tensor
+        self.grid = grid
+        self.voxel_num = voxel_num # for tensorrt
+        self.benchmark = benchmark
+        self.benchmark_record = {}
+
+    def replace_feature(self, feature):
+        """Return a new SparseConvTensor that holds ``feature``.
+
+        we need to replace x.features = F.relu(x.features) with
+        x = x.replace_feature(F.relu(x.features)) due to limit of torch.fx.
+        All other members, including ``indice_dict`` and the benchmark
+        state, are shared with ``self``.
+        """
+        new_spt = SparseConvTensor(feature, self.indices, self.spatial_shape,
+                                   self.batch_size, self.grid, self.voxel_num,
+                                   self.benchmark)
+        # __init__ takes no indice_dict argument (its seventh positional
+        # slot is ``benchmark``), so the cache is carried over explicitly.
+        new_spt.indice_dict = self.indice_dict
+        new_spt.benchmark_record = self.benchmark_record
+        return new_spt
+
+    @property
+    def features(self):
+        """Feature tensor given at construction time."""
+        return self._features
+
+    @features.setter
+    def features(self, val):
+        """Always raises ValueError: features must not be assigned directly."""
+        msg = ("you can't set feature directly, use 'x = x.replace_feature(your_new_feature)'"
+                " to generate new SparseConvTensor instead.")
+        raise ValueError(msg)
+
+    @classmethod
+    def from_dense(cls, x: torch.Tensor):
+        """create sparse tensor from channel last dense tensor by to_sparse
+        x must be NHWC tensor, channel last
+        """
+        x = x.to_sparse(x.ndim - 1)
+        spatial_shape = x.shape[1:-1]
+        batch_size = x.shape[0]
+        indices_th = x.indices().permute(1, 0).contiguous().int()
+        features_th = x.values()
+        return cls(features_th, indices_th, spatial_shape, batch_size)
+
+    @property
+    def spatial_size(self):
+        """Product of all entries of ``spatial_shape``."""
+        return np.prod(self.spatial_shape)
+
+    def find_indice_pair(self, key) -> Optional[IndiceData]:
+        """Return the entry stored in ``indice_dict`` under ``key``.
+
+        Returns None when ``key`` is None or has no entry.
+        """
+        if key is None:
+            return None
+        return self.indice_dict.get(key)
+
+    def dense(self, channels_first=True):
+        """Scatter the features into a dense, zero-filled tensor.
+
+        The dense tensor has shape [batch_size, *spatial_shape, channels].
+        With ``channels_first`` (the default) the channel axis is moved to
+        position 1 and the result is made contiguous.
+        """
+        output_shape = [self.batch_size] + list(
+            self.spatial_shape) + [self.features.shape[1]]
+        res = scatter_nd(
+            self.indices.to(self.features.device).long(), self.features,
+            output_shape)
+        if not channels_first:
+            return res
+        ndim = len(self.spatial_shape)
+        # [0, ndim + 1, 1, ..., ndim]: the last (channel) axis goes right
+        # after the batch axis.
+        trans_params = list(range(0, ndim + 1))
+        trans_params.insert(1, ndim + 1)
+        return res.permute(*trans_params).contiguous()
+
+    # remove this due to limit of torch.fx
+    # @property
+    # def sparity(self):
+    #     return self.indices.shape[0] / np.prod(
+    #         self.spatial_shape) / self.batch_size
+
+    def shadow_copy(self) -> "SparseConvTensor":
+        """create a new spconv tensor with all member unchanged"""
+        # voxel_num and benchmark go by keyword: the sixth positional
+        # parameter of __init__ is voxel_num, not benchmark.
+        tensor = SparseConvTensor(self.features, self.indices,
+                                  self.spatial_shape, self.batch_size,
+                                  self.grid,
+                                  voxel_num=self.voxel_num,
+                                  benchmark=self.benchmark)
+        tensor.benchmark_record = self.benchmark_record
+        tensor.indice_dict = self.indice_dict
+        return tensor
--- a/spconv/pytorch/core_nofx.py
+++ b/spconv/pytorch/core_nofx.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import numpy as np
+import torch
+
+
+class IndiceData(object):
+    """Record of index data, as stored in ``SparseConvTensor.indice_dict``.
+
+    The five constructor arguments are kept unchanged as attributes of the
+    same name.
+    """
+    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
+                 out_spatial_shape):
+        self.out_indices, self.indices = out_indices, indices
+        self.indice_pairs, self.indice_pair_num = (indice_pairs,
+                                                   indice_pair_num)
+        self.out_spatial_shape = out_spatial_shape
+
+
+def scatter_nd(indices, updates, shape):
+    """PyTorch edition of TensorFlow's ``scatter_nd``.
+
+    Creates a zero tensor of ``shape`` and writes ``updates`` at the
+    positions given by the last axis of ``indices``. This function performs
+    no input validation, so use it carefully. When an index repeats, the
+    values are not accumulated (TensorFlow adds them).
+
+    Args:
+        indices: integer tensor whose last axis holds ``ndim`` coordinates
+            into the leading ``ndim`` axes of the result.
+        updates: values to write; viewed as
+            ``indices.shape[:-1] + shape[ndim:]`` before assignment.
+        shape: list or tuple with the shape of the result.
+
+    Returns:
+        Tensor of ``shape`` with the dtype and device of ``updates``.
+    """
+    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
+    ndim = indices.shape[-1]
+    # list(...) on both sides so a tuple ``shape`` concatenates as well.
+    output_shape = list(indices.shape[:-1]) + list(shape[ndim:])
+    flatted_indices = indices.view(-1, ndim)
+    # Index with a tuple: a plain list of index tensors relies on legacy
+    # sequence-as-tuple handling of multidimensional indices.
+    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis, )
+    ret[slices] = updates.view(*output_shape)
+    return ret
+
+
+class SparseConvTensor(object):
+    """Sparse tensor: per-point features plus the indices locating them.
+
+    ``features`` is read-only; use ``replace_feature`` to obtain a tensor
+    with new features.
+    """
+    def __init__(self,
+                 features,
+                 indices,
+                 spatial_shape,
+                 batch_size,
+                 grid=None,
+                 voxel_num=None,
+                 benchmark=False):
+        """
+        Args:
+            features: [num_points, num_features] feature tensor
+            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
+            spatial_shape: spatial shape of your sparse data
+            batch_size: batch size of your sparse data
+            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
+                is very large.
+            voxel_num: stored unchanged on the tensor.
+            benchmark: whether to enable benchmark. if enabled, all sparse operators will be
+                recorded to SparseConvTensor.
+        """
+        self._features = features
+        self.indices = indices
+        self.spatial_shape = spatial_shape
+        self.batch_size = batch_size
+        self.indice_dict = {}
+        if grid is None:
+            grid = torch.Tensor()  # empty tensor
+        self.grid = grid
+        self.voxel_num = voxel_num
+        self.benchmark = benchmark
+        self.benchmark_record = {}
+
+    def replace_feature(self, feature):
+        """Return a new SparseConvTensor that holds ``feature``.
+
+        we need to replace x.features = F.relu(x.features) with
+        x = x.replace_feature(F.relu(x.features)) due to limit of torch.fx.
+        All other members, including ``indice_dict`` and the benchmark
+        state, are shared with ``self``.
+        """
+        new_spt = SparseConvTensor(feature, self.indices, self.spatial_shape,
+                                   self.batch_size, self.grid, self.voxel_num,
+                                   self.benchmark)
+        # __init__ takes no indice_dict argument (its seventh positional
+        # slot is ``benchmark``), so the cache is carried over explicitly.
+        new_spt.indice_dict = self.indice_dict
+        new_spt.benchmark_record = self.benchmark_record
+        return new_spt
+
+    @property
+    def features(self):
+        """Feature tensor given at construction time."""
+        return self._features
+
+    @features.setter
+    def features(self, val):
+        """Always raises ValueError: features must not be assigned directly."""
+        # The attribute is ``features``; the previous message named a
+        # non-existent ``x.feature``.
+        msg = ("you can't set feature directly, use 'x = x.replace_feature(F.relu(x.features))'"
+                " to generate new SparseConvTensor instead.")
+        raise ValueError(msg)
+
+    @classmethod
+    def from_dense(cls, x: torch.Tensor):
+        """create sparse tensor from channel last dense tensor by to_sparse
+        x must be NHWC tensor, channel last
+        """
+        x = x.to_sparse(x.ndim - 1)
+        spatial_shape = x.shape[1:-1]
+        batch_size = x.shape[0]
+        indices_th = x.indices().permute(1, 0).contiguous().int()
+        features_th = x.values()
+        return cls(features_th, indices_th, spatial_shape, batch_size)
+
+    @property
+    def spatial_size(self):
+        """Product of all entries of ``spatial_shape``."""
+        return np.prod(self.spatial_shape)
+
+    def find_indice_pair(self, key) -> Optional[IndiceData]:
+        """Return the entry stored in ``indice_dict`` under ``key``.
+
+        Returns None when ``key`` is None or has no entry.
+        """
+        if key is None:
+            return None
+        return self.indice_dict.get(key)
+
+    def dense(self, channels_first=True):
+        """Scatter the features into a dense, zero-filled tensor.
+
+        The dense tensor has shape [batch_size, *spatial_shape, channels].
+        With ``channels_first`` (the default) the channel axis is moved to
+        position 1 and the result is made contiguous.
+        """
+        output_shape = [self.batch_size] + list(
+            self.spatial_shape) + [self.features.shape[1]]
+        res = scatter_nd(
+            self.indices.to(self.features.device).long(), self.features,
+            output_shape)
+        if not channels_first:
+            return res
+        ndim = len(self.spatial_shape)
+        # [0, ndim + 1, 1, ..., ndim]: the last (channel) axis goes right
+        # after the batch axis.
+        trans_params = list(range(0, ndim + 1))
+        trans_params.insert(1, ndim + 1)
+        return res.permute(*trans_params).contiguous()
+
+    # @property
+    # def sparity(self):
+    #     return self.indices.shape[0] / np.prod(
+    #         self.spatial_shape) / self.batch_size
+
+    def shadow_copy(self) -> "SparseConvTensor":
+        """create a new spconv tensor with all member unchanged"""
+        # voxel_num and benchmark go by keyword: the sixth positional
+        # parameter of __init__ is voxel_num, not benchmark.
+        tensor = SparseConvTensor(self.features, self.indices,
+                                  self.spatial_shape, self.batch_size,
+                                  self.grid,
+                                  voxel_num=self.voxel_num,
+                                  benchmark=self.benchmark)
+        tensor.benchmark_record = self.benchmark_record
+        tensor.indice_dict = self.indice_dict
+        return tensor
--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cumm import tensorview as tv 
+import torch 
+from typing import Optional, List
+# Lookup table from torch dtypes to the matching tensorview (tv) dtypes.
+# torch_tensor_to_tv consults it when the caller gives no explicit dtype;
+# a torch dtype missing from this table raises KeyError there.
+_TORCH_DTYPE_TO_TV = {
+    torch.float32: tv.float32,
+    torch.float64: tv.float64,
+    torch.float16: tv.float16,
+    torch.int32: tv.int32,
+    torch.int64: tv.int64,
+    torch.int8: tv.int8,
+    torch.int16: tv.int16,
+    torch.uint8: tv.uint8,
+}
+
+def torch_tensor_to_tv(ten: torch.Tensor, dtype: Optional[int] = None, shape: Optional[List[int]] = None):
+    """Hand the data pointer of a torch tensor to ``tv.from_blob``.
+
+    Args:
+        ten: contiguous tensor living on a "cpu" or "cuda" device.
+        dtype: tv dtype to use; looked up from ``ten.dtype`` in
+            ``_TORCH_DTYPE_TO_TV`` when None.
+        shape: shape to use; ``list(ten.shape)`` when None.
+
+    Returns:
+        The result of ``tv.from_blob(ptr, shape, dtype, device)``, where
+        device is -1 for cpu tensors and 0 for cuda tensors.
+
+    Raises:
+        AssertionError: if ``ten`` is not contiguous.
+        NotImplementedError: if the device type is neither cpu nor cuda.
+    """
+    assert ten.is_contiguous(), "must be contiguous tensor"
+    data_ptr = ten.data_ptr()
+    device_kind = ten.device.type
+    if device_kind not in ("cpu", "cuda"):
+        raise NotImplementedError
+    tv_device = -1 if device_kind == "cpu" else 0
+    blob_shape = list(ten.shape) if shape is None else shape
+    blob_dtype = _TORCH_DTYPE_TO_TV[ten.dtype] if dtype is None else dtype
+    return tv.from_blob(data_ptr, blob_shape, blob_dtype, tv_device)
+
+def get_current_stream():
+    """Return the ``cuda_stream`` attribute of torch's current CUDA stream."""
+    current = torch.cuda.current_stream()
+    return current.cuda_stream
+
+if __name__ == "__main__":
+    # Smoke check: wrap a small random CPU tensor and print its numpy view.
+    sample = torch.rand(2, 2)
+    print(torch_tensor_to_tv(sample).numpy_view())
\ No newline at end of file
--- a/spconv/functional.py
+++ b/spconv/functional.py
-# Copyright 2019 Yan Yan
+# Copyright 2021 Yan Yan
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@ import torch
 from torch import nn
 from torch.autograd import Function

-import spconv.ops as ops
+import spconv.pytorch.ops as ops


 class SparseConvFunction(Function):

--- a/spconv/identity.py
+++ b/spconv/identity.py
--- a/spconv/modules.py
+++ b/spconv/modules.py
-# Copyright 2019 Yan Yan
+# Copyright 2021 Yan Yan
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+
 import sys
 import time
 from collections import OrderedDict
@@ -19,7 +20,7 @@ from collections import OrderedDict
 import torch
 from torch import nn

-import spconv
+from spconv import pytorch as spconv


 def is_spconv_module(module):
@@ -28,7 +29,7 @@ def is_spconv_module(module):


 def is_sparse_conv(module):
-    from spconv.conv import SparseConvolution
+    from spconv.pytorch.conv import SparseConvolution
    return isinstance(module, SparseConvolution)


@@ -49,7 +50,9 @@ def _mean_update(vals, m_vals, t):
 class SparseModule(nn.Module):
    """ place holder, all module subclass from this will take sptensor in SparseSequential.
    """
-    pass
+    def __init__(self, name=None):
+        super().__init__()
+        self.name = name


 class SparseSequential(SparseModule):
@@ -140,50 +143,3 @@ class SparseSequential(SparseModule):
                    input = module(input)
        return input

-    def fused(self):
-        """don't use this. no effect.
-        """
-        from spconv.conv import SparseConvolution
-        mods = [v for k, v in self._modules.items()]
-        fused_mods = []
-        idx = 0
-        while idx < len(mods):
-            if is_sparse_conv(mods[idx]):
-                if idx < len(mods) - 1 and isinstance(mods[idx + 1],
-                                                      nn.BatchNorm1d):
-                    new_module = SparseConvolution(
-                        ndim=mods[idx].ndim,
-                        in_channels=mods[idx].in_channels,
-                        out_channels=mods[idx].out_channels,
-                        kernel_size=mods[idx].kernel_size,
-                        stride=mods[idx].stride,
-                        padding=mods[idx].padding,
-                        dilation=mods[idx].dilation,
-                        groups=mods[idx].groups,
-                        bias=True,
-                        subm=mods[idx].subm,
-                        output_padding=mods[idx].output_padding,
-                        transposed=mods[idx].transposed,
-                        inverse=mods[idx].inverse,
-                        indice_key=mods[idx].indice_key,
-                        fused_bn=True,
-                    )
-                    new_module.load_state_dict(mods[idx].state_dict(), False)
-                    new_module.to(mods[idx].weight.device)
-                    conv = new_module
-                    bn = mods[idx + 1]
-                    conv.bias.data.zero_()
-                    conv.weight.data[:] = conv.weight.data * bn.weight.data / (
-                        torch.sqrt(bn.running_var) + bn.eps)
-                    conv.bias.data[:] = (
-                        conv.bias.data - bn.running_mean) * bn.weight.data / (
-                            torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data
-                    fused_mods.append(conv)
-                    idx += 2
-                else:
-                    fused_mods.append(mods[idx])
-                    idx += 1
-            else:
-                fused_mods.append(mods[idx])
-                idx += 1
-        return SparseSequential(*fused_mods)
--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+from enum import Enum
+from cumm import tensorview as tv
+from cumm.gemm.algospec.core import ShuffleStrideType
+
+import torch
+import numpy as np
+import spconv
+from spconv.algo import AlgoHint, ConvAlgo
+from typing import List, Union
+from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
+from spconv.core_cc.csrc.sparse.all import SpconvOps
+from spconv.algo import GEMM  # , GATHER, SCATTER
+import time
+from spconv.constants import FILTER_HWIO
+
+
+def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
+    """Compute the per-axis output size of a (non-transposed) convolution.
+
+    For each axis ``i`` the size is
+    ``(in + 2 * pad - dilation * (k - 1) - 1) // stride + 1``.
+    An axis whose ``kernel_size`` is ``-1`` gets output size 1 instead.
+
+    Args:
+        input_size: per-axis input sizes; its length sets the number of axes.
+        kernel_size, stride, padding, dilation: per-axis sequences indexed
+            over the same axes as ``input_size``.
+
+    Returns:
+        A list with one output size per axis.
+    """
+    ndim = len(input_size)
+    output_size = []
+    for i in range(ndim):
+        # Evaluated even when kernel_size[i] == -1; the value is simply
+        # discarded in that case.
+        size = (input_size[i] + 2 * padding[i] - dilation[i] *
+                (kernel_size[i] - 1) - 1) // stride[i] + 1
+        if kernel_size[i] == -1:
+            output_size.append(1)
+        else:
+            output_size.append(size)
+    return output_size
+
+
+def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
+                           output_padding):
+    """Compute the per-axis output size of a transposed convolution.
+
+    For each axis ``i`` the size is
+    ``(in - 1) * stride - 2 * pad + k + output_padding``.
+
+    NOTE(review): ``dilation`` is accepted but never read. The usual
+    transposed-convolution formula (e.g. torch.nn.ConvTranspose*) uses
+    ``dilation * (k - 1) + 1`` in place of ``k``; the two agree only for
+    dilation == 1. Confirm whether dilation > 1 is meant to be supported.
+
+    Args:
+        input_size: per-axis input sizes; its length sets the number of axes.
+        kernel_size, stride, padding, output_padding: per-axis sequences.
+        dilation: unused (see note above).
+
+    Returns:
+        A list with one output size per axis.
+
+    Raises:
+        ValueError: if any ``kernel_size[i]`` equals -1.
+    """
+    ndim = len(input_size)
+    output_size = []
+    for i in range(ndim):
+        # NOTE(review): only the exact value -1 is rejected, although the
+        # message below talks about "kernel_size < 0".
+        if kernel_size[i] == -1:
+            raise ValueError("deconv don't support kernel_size < 0")
+        size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
+            i] + output_padding[i]
+        output_size.append(size)
+    return output_size
+
+
+def get_indice_pairs(indices: torch.Tensor,
+                     batch_size: int,
+                     spatial_shape: List[int],
+                     algo: ConvAlgo,
+                     ksize: List[int],
+                     stride: List[int],
+                     padding: List[int],
+                     dilation: List[int],
+                     out_padding: List[int],
+                     subm: bool = False,
+                     transpose: bool = False):
+    """Generate output indices and input/output index pairs for a sparse conv.
+
+    Buffers are allocated with torch and filled in place by ``SpconvOps``
+    kernels on the stream returned by ``get_current_stream()``. That helper is
+    called unconditionally, so this presumably only works with CUDA tensors
+    (TODO confirm).
+
+    Args:
+        indices: 2-D index tensor; ``indices.shape[0]`` is the number of
+            active inputs and ``indices.shape[1]`` is ``ndim + 1``
+            (presumably a batch index followed by coordinates -- confirm).
+        batch_size: forwarded to the index-generation kernels.
+        spatial_shape: input spatial shape.
+        algo: must be ``ConvAlgo.Native``; anything else fails the assert.
+        ksize, stride, padding, dilation: per-axis convolution parameters.
+        out_padding: only read when ``transpose`` is true and ``subm`` is
+            false (passed to ``get_deconv_output_size``).
+        subm: when true the output shape and output indices are the input's.
+        transpose: selects the deconv output-size formula and is forwarded to
+            the kernels as ``transposed``.
+
+    Returns:
+        Tuple ``(out_inds, pair, indice_num_per_loc)`` where ``pair`` has
+        shape ``(2, kv, indices.shape[0])`` and was initialised to -1, and
+        ``indice_num_per_loc`` has shape ``(kv,)`` and was initialised to 0,
+        with ``kv`` the product of ``ksize``.
+
+    Raises:
+        AssertionError: if ``algo`` is not ``ConvAlgo.Native``.
+    """
+    # torch.cuda.synchronize()
+    # t = time.time()
+    # NOTE(review): ndim is only referenced by the commented-out stage1_5
+    # call further down; it is otherwise unused.
+    ndim = indices.shape[1] - 1
+    # kv = number of kernel offsets (product of the kernel extents).
+    kv: int = functools.reduce(lambda x, y: x * y, ksize, 1)
+    if not subm:
+        if transpose:
+            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
+                                               padding, dilation, out_padding)
+        else:
+            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
+                                             padding, dilation)
+    else:
+        out_shape = spatial_shape
+    assert algo == ConvAlgo.Native, "TODO"
+    stream = get_current_stream()
+
+    # Entries the kernels do not write keep the -1 fill value.
+    pair = torch.full((2, kv, indices.shape[0]),
+                      -1,
+                      dtype=indices.dtype,
+                      device=indices.device)
+    indice_num_per_loc = torch.zeros((kv, ),
+                                     dtype=indices.dtype,
+                                     device=indices.device)
+
+    inds_tv = torch_tensor_to_tv(indices)
+    pair_tv = torch_tensor_to_tv(pair)
+    indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)
+
+    if subm:
+        # Submanifold case: the output index tensor is the input one itself.
+        out_inds = indices
+
+        # Scratch buffer of 2 * N int64 slots, handed to the kernel as
+        # tv.custom64 (presumably hash-table storage -- confirm).
+        hashdata = torch.empty((out_inds.shape[0] * 2, ),
+                               dtype=torch.int64,
+                               device=indices.device)
+        out_inds_tv = torch_tensor_to_tv(out_inds)
+        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+
+        SpconvOps.generate_subm_conv_inds(inds_tv,
+                                          hashdata_tv,
+                                          pair_tv,
+                                          out_inds_tv,
+                                          indice_num_per_loc_tv,
+                                          batch_size=batch_size,
+                                          input_dims=spatial_shape,
+                                          ksize=ksize,
+                                          dilation=dilation,
+                                          stream_int=stream)
+        # torch.cuda.synchronize()
+        # print("SUBM", time.time() - t)
+
+    else:
+        # One slot per (kernel offset, input) pair plus one extra element.
+        indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
+                                        dtype=indices.dtype,
+                                        device=indices.device)
+        indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
+
+        SpconvOps.generate_conv_inds_stage1(inds_tv,
+                                            pair_tv,
+                                            indice_pairs_uniq_tv,
+                                            indice_num_per_loc_tv,
+                                            batch_size=batch_size,
+                                            output_dims=out_shape,
+                                            input_dims=spatial_shape,
+                                            ksize=ksize,
+                                            stride=stride,
+                                            padding=padding,
+                                            dilation=dilation,
+                                            transposed=transpose,
+                                            stream_int=stream)
+        # The number of active outputs is the unique count minus one;
+        # presumably one unique value is a sentinel written by stage 1 for
+        # unused slots -- TODO confirm against the kernel.
+        uniq_res = indice_pairs_uniq.unique()
+        num_act_out = uniq_res.shape[0] - 1
+        uniq_res_tv = torch_tensor_to_tv(uniq_res)
+        # num_act_out = SpconvOps.generate_conv_inds_stage1_5(
+        #     indice_pairs_uniq_tv,
+        #     ndim,
+        #     uniq_size=indice_pairs_uniq_tv.size,
+        #     stream_int=stream)
+        # uniq_res_tv = indice_pairs_uniq_tv.slice_first_axis(0, num_act_out)
+        out_inds = torch.empty((num_act_out, indices.shape[1]),
+                               dtype=indices.dtype,
+                               device=indices.device)
+        hashdata = torch.empty((out_inds.shape[0] * 2, ),
+                               dtype=torch.int64,
+                               device=indices.device)
+        out_inds_tv = torch_tensor_to_tv(out_inds)
+        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+        SpconvOps.generate_conv_inds_stage2(inds_tv,
+                                            hashdata_tv,
+                                            pair_tv,
+                                            uniq_res_tv,
+                                            out_inds_tv,
+                                            num_out_act=num_act_out,
+                                            batch_size=batch_size,
+                                            output_dims=out_shape,
+                                            input_dims=spatial_shape,
+                                            ksize=ksize,
+                                            stride=stride,
+                                            padding=padding,
+                                            dilation=dilation,
+                                            transposed=transpose,
+                                            stream_int=stream)
+        # torch.cuda.synchronize()
+        # print("REGU", time.time() - t)
+    return out_inds, pair, indice_num_per_loc
+
+
+def indice_conv(features: torch.Tensor,
+                filters: torch.Tensor,
+                indice_pairs: torch.Tensor,
+                indice_pair_num: torch.Tensor,
+                num_activate_out: int,
+                inverse: bool = False,
+                subm: bool = False,
+                algo: ConvAlgo = ConvAlgo.Native):
+    """Sparse convolution forward: one gather-GEMM-scatter per kernel offset.
+
+    Calls ``torch.cuda`` APIs unconditionally, so this presumably requires
+    CUDA tensors (TODO confirm).
+
+    Args:
+        features: 2-D input features (used as the left operand of
+            ``torch.mm`` and of the GEMM calls).
+        filters: weights; reshaped to ``(kv, *filters.shape[-2:])``. The
+            output-channel axis is the last one when ``FILTER_HWIO`` is set,
+            otherwise the second to last.
+        indice_pairs: index pairs, indexed as ``indice_pairs[0 or 1][offset]``.
+        indice_pair_num: number of valid pairs per kernel offset.
+        num_activate_out: number of output rows (only used when not ``subm``).
+        inverse: swaps which half of ``indice_pairs`` is read as input and
+            which as output indices.
+        subm: submanifold mode; the centre offset is computed with a dense
+            ``torch.mm`` and skipped in the loop.
+        algo: NOTE(review): accepted but never read in this function.
+
+    Returns:
+        ``out_features`` with ``out_channel`` columns.
+
+    Raises:
+        NotImplementedError: for int8 / qint8 features.
+    """
+    # filters: RSKC
+    # torch.cuda.synchronize()
+    # t = time.time()
+    if features.dtype == torch.int8 or features.dtype == torch.qint8:
+        raise NotImplementedError("work in progress")
+    if FILTER_HWIO:
+        out_channel = filters.shape[-1]
+    else:
+        out_channel = filters.shape[-2]
+    # Collapse all leading (kernel) axes into one axis of length kv.
+    filters = filters.reshape(-1, *filters.shape[-2:])
+    kv = filters.shape[0]
+    kv_center = kv // 2
+    if subm:
+        # Centre offset: every input row contributes to the same output row,
+        # so a plain dense matmul initialises the output.
+        if FILTER_HWIO:
+            out_features = torch.mm(features, filters[kv_center])
+        else:
+            out_features = torch.mm(features, filters[kv_center].T)
+    else:
+        out_features = torch.zeros((num_activate_out, out_channel),
+                                   dtype=features.dtype,
+                                   device=features.device)
+    if kv == 1 and subm:
+        return out_features
+
+    stream = get_current_stream()
+    # Copies the per-offset counts to the host (device -> host transfer).
+    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    arch = torch.cuda.get_device_capability()
+    # In subm mode the output already holds the centre result, so the very
+    # first GEMM must accumulate (beta = 1) instead of overwrite.
+    inited: bool = subm
+    a = torch_tensor_to_tv(features)
+    c = torch_tensor_to_tv(out_features)
+    # Kernel offset whose pair count is used as the representative problem
+    # size for algorithm lookup / profiling.
+    profile_idx = kv_center
+    if subm:
+        profile_idx = kv_center - 1
+    # profile_idx = first_n
+    nhot_profile = indice_pair_num_cpu[profile_idx]
+
+    # print(nhot_profile, indice_pair_num_cpu)
+    # The three positional booleans are presumably transpose flags for
+    # A, B and C (B is transposed when filters are not HWIO) -- confirm.
+    profile_res = GEMM.get_profiled_algo(
+        a.shape,
+        filters.shape[-2:],
+        c.shape,
+        False,
+        False if FILTER_HWIO else True,
+        False,
+        arch=arch,
+        shuffle_type=ShuffleStrideType.ShuffleAC,
+        a_inds_shape=[nhot_profile],
+        c_inds_shape=[nhot_profile],
+        hint=AlgoHint.Fowrard.value)
+
+
+    # NOTE(review): maxnhot is computed but never used in this function.
+    maxnhot = max(indice_pair_num_cpu)
+    if profile_res is None:
+        # run profile on center
+        inp_indices_th = indice_pairs[int(inverse)][profile_idx, :nhot_profile]
+        out_indices_th = indice_pairs[int(not inverse)][
+            profile_idx, :nhot_profile]
+        inp_indices = torch_tensor_to_tv(inp_indices_th)
+        out_indices = torch_tensor_to_tv(out_indices_th)
+        filter_tv = torch_tensor_to_tv(filters)[profile_idx]
+
+        profile_res, min_time = GEMM.profile_and_cache(
+            a,
+            filter_tv,
+            c,
+            False,
+            False if FILTER_HWIO else True,
+            False,
+            arch=arch,
+            shuffle_type=ShuffleStrideType.ShuffleAC,
+            a_inds=inp_indices,
+            c_inds=out_indices,
+            alpha=1.0,
+            beta=0.0,
+            hint=AlgoHint.Fowrard.value,
+            stream=stream)
+
+    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
+    pair_in = indice_pairs_tv[int(inverse)]
+    pair_out = indice_pairs_tv[int(not inverse)]
+    filters_tv = torch_tensor_to_tv(filters)
+    for i, nhot in enumerate(indice_pair_num_cpu):
+        if subm and i == kv_center:
+            # Already handled by the dense torch.mm above.
+            continue
+        if subm and i > kv_center:
+            # Use the count of the mirrored offset; presumably the subm index
+            # kernel only fills counts for the first half -- TODO confirm.
+            nhot = indice_pair_num_cpu[kv - i - 1]
+        if nhot <= 0:
+            continue
+        inp_indices = pair_in[i].slice_first_axis(0, nhot)
+        out_indices = pair_out[i].slice_first_axis(0, nhot)
+        b = filters_tv[i]
+        # inp @ filter.T, NC @ KC
+        # First executed GEMM overwrites (beta 0) unless the output was
+        # pre-initialised; later ones accumulate (beta 1).
+        beta = 1.0 if inited else 0.0
+        # NOTE(review): algo_desp is only referenced by the commented-out
+        # debug prints below.
+        algo_desp = GEMM.run_profile(profile_res,
+                                     a,
+                                     b,
+                                     c,
+                                     False,
+                                     False if FILTER_HWIO else True,
+                                     False,
+                                     arch=arch,
+                                     stream=stream,
+                                     shuffle_type=ShuffleStrideType.ShuffleAC,
+                                     a_inds=inp_indices,
+                                     c_inds=out_indices,
+                                     hint=AlgoHint.Fowrard.value,
+                                     alpha=1.0,
+                                     beta=beta)
+
+        # gather_times += gather_time
+        inited = True
+    # torch.cuda.synchronize()
+    # # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
+    # # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t)
+
+    # # print(indice_pair_num_cpu)
+    # print("G", time.time() - t)
+    return out_features
+
+
+def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
+                      num_activate_out, inverse, subm):
+    """Placeholder: always raises ``NotImplementedError``."""
+    raise NotImplementedError
+
+
+def indice_conv_backward(features: torch.Tensor,
+                         filters: torch.Tensor,
+                         out_bp: torch.Tensor,
+                         indice_pairs: torch.Tensor,
+                         indice_pair_num: torch.Tensor,
+                         inverse: bool = False,
+                         subm: bool = False,
+                         algo: ConvAlgo = ConvAlgo.Native):
+    """Sparse convolution backward: input gradient and filter gradient.
+
+    For every kernel offset two GEMMs are run: one accumulating into the
+    input gradient (``din``) and one writing the filter gradient slice for
+    that offset. Calls ``torch.cuda`` APIs unconditionally, so this
+    presumably requires CUDA tensors (TODO confirm).
+
+    Args:
+        features: forward input features; must be contiguous.
+        filters: forward weights; must be contiguous after being reshaped to
+            ``(kv, *filters.shape[-2:])``.
+        out_bp: gradient w.r.t. the forward output; must be contiguous.
+        indice_pairs: index pairs, indexed as ``indice_pairs[0 or 1][offset]``.
+        indice_pair_num: number of valid pairs per kernel offset.
+        inverse: swaps which half of ``indice_pairs`` is read as input and
+            which as output indices.
+        subm: submanifold mode; the centre offset is handled with dense
+            ``torch.mm`` calls and skipped in the loop.
+        algo: NOTE(review): accepted but never read in this function.
+
+    Returns:
+        Tuple ``(din, dfilters)`` where ``din`` has the shape of ``features``
+        and ``dfilters`` is reshaped back to the original ``filters`` shape.
+
+    Raises:
+        AssertionError: if any of ``out_bp``, ``filters`` (reshaped) or
+            ``features`` is not contiguous.
+    """
+    # torch.cuda.synchronize()
+    # t = time.time()
+
+    # NOTE(review): num_activate_out and out_channel are assigned but never
+    # read afterwards.
+    num_activate_out = out_bp.shape[0]
+    out_channel = out_bp.shape[-1]
+    # Remember the caller's layout so dfilters can be reshaped back on return.
+    filters_shape = filters.shape
+    filters = filters.reshape(-1, *filters.shape[-2:])
+    kv = filters.shape[0]
+    kv_center = kv // 2
+    assert out_bp.is_contiguous()
+    assert filters.is_contiguous()
+    assert features.is_contiguous()
+
+    if subm:
+        # Centre offset is a dense problem: compute its weight gradient in
+        # place into dfilters[kv_center] and initialise din with its
+        # contribution.
+        dfilters = torch.zeros_like(filters)
+        if FILTER_HWIO:
+            torch.mm(features.T, out_bp, out=dfilters[kv_center])
+            # TODO can we use torch mm for f16 backward weight?
+            din = torch.mm(out_bp, filters[kv_center].T)
+        else:
+            torch.mm(out_bp.T, features, out=dfilters[kv_center])
+            # TODO can we use torch mm for f16 backward weight?
+            din = torch.mm(out_bp, filters[kv_center])
+    else:
+        dfilters = torch.zeros_like(filters)
+        din = torch.zeros_like(features)
+    if kv == 1 and subm:
+        return (din, dfilters.reshape(filters_shape))
+
+    # NOTE(review): this value is never read before ``inited`` is assigned
+    # again right before the main loop below.
+    inited: bool = subm
+    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
+    # torch slice (a_th[x]) is very slow, so we need to use tv.Tensor earlier.
+    pair_in = indice_pairs_tv[int(inverse)]
+    pair_out = indice_pairs_tv[int(not inverse)]
+
+    stream = get_current_stream()
+    # Copies the per-offset counts to the host (device -> host transfer).
+    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    arch = torch.cuda.get_device_capability()
+    filters_tv = torch_tensor_to_tv(filters)
+
+    dfilters_tv = torch_tensor_to_tv(dfilters)
+    out_bp_tv = torch_tensor_to_tv(out_bp)
+    features_tv = torch_tensor_to_tv(features)
+
+    din_tv = torch_tensor_to_tv(din)
+
+    # Kernel offset whose pair count is used as the representative problem
+    # size for algorithm lookup / profiling.
+    profile_idx = kv_center
+    if subm:
+        profile_idx = kv_center - 1
+    # profile_idx = first_n
+    nhot_profile = indice_pair_num_cpu[profile_idx]
+
+    # print(nhot_profile, indice_pair_num_cpu)
+    # --- input-gradient (dgrad) algorithm: out_bp x filter -> din ---
+    # The three positional booleans sit where extract_mnk below receives
+    # algo_desp.trans_a / trans_b / trans_c, so they appear to be the
+    # transpose flags for A, B and C.
+    profile_res_dgrad = GEMM.get_profiled_algo(
+        out_bp_tv.shape,
+        filters.shape[-2:],
+        din_tv.shape,
+        False,
+        True if FILTER_HWIO else False,
+        False,
+        arch=arch,
+        shuffle_type=ShuffleStrideType.ShuffleAC,
+        a_inds_shape=[nhot_profile],
+        c_inds_shape=[nhot_profile],
+        hint=AlgoHint.BackwardInput.value)
+    if profile_res_dgrad is None:
+        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
+        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
+        filter_tv = filters_tv[profile_idx]
+        profile_res_dgrad, min_time = GEMM.profile_and_cache(
+            out_bp_tv,
+            filter_tv,
+            din_tv,
+            False,
+            True if FILTER_HWIO else False,
+            False,
+            arch=arch,
+            shuffle_type=ShuffleStrideType.ShuffleAC,
+            a_inds=out_indices,
+            c_inds=inp_indices,
+            alpha=1.0,
+            beta=0.0,
+            hint=AlgoHint.BackwardInput.value,
+            stream=stream)
+    # --- weight-gradient (wgrad) algorithm: A.T x B -> dfilter slice ---
+    # Operand order depends on the filter layout so the product lands in the
+    # same layout as ``filters``.
+    if not FILTER_HWIO:
+        a_wgrad = out_bp_tv
+        b_wgrad = features_tv
+    else:
+        a_wgrad = features_tv
+        b_wgrad = out_bp_tv
+    profile_res_wgrad = GEMM.get_profiled_algo(
+        a_wgrad.shape,
+        b_wgrad.shape,
+        filters.shape[-2:],
+        True,
+        False,
+        False,
+        arch=arch,
+        shuffle_type=ShuffleStrideType.ShuffleAB,
+        a_inds_shape=[nhot_profile],
+        b_inds_shape=[nhot_profile],
+        hint=AlgoHint.BackwardWeight.value)
+
+    if profile_res_wgrad is None:
+        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
+        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
+        dfilter_tv = dfilters_tv[profile_idx]
+        if not FILTER_HWIO:
+            a_inds_wgrad = out_indices
+            b_inds_wgrad = inp_indices
+        else:
+            a_inds_wgrad = inp_indices
+            b_inds_wgrad = out_indices
+        profile_res_wgrad, min_time = GEMM.profile_and_cache(
+            a_wgrad,
+            b_wgrad,
+            dfilter_tv,
+            True,
+            False,
+            False,
+            arch=arch,
+            shuffle_type=ShuffleStrideType.ShuffleAB,
+            a_inds=a_inds_wgrad,
+            b_inds=b_inds_wgrad,
+            alpha=1.0,
+            beta=0.0,
+            hint=AlgoHint.BackwardWeight.value,
+            stream=stream)
+        # print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time)
+    # Size the wgrad workspace for the largest per-offset pair count.
+    maxnhot = max(indice_pair_num_cpu)
+    # get workspace size for wgrad
+    if not FILTER_HWIO:
+        a_shape = [maxnhot, out_bp_tv.dim(1)]
+        b_shape = [maxnhot, features_tv.dim(1)]
+    else:
+        b_shape = [maxnhot, out_bp_tv.dim(1)]
+        a_shape = [maxnhot, features_tv.dim(1)]
+    m, n, k = GEMM.extract_mnk(a_shape,
+                               b_shape,
+                               profile_res_wgrad.algo_desp.trans_a,
+                               profile_res_wgrad.algo_desp.trans_b,
+                               profile_res_wgrad.algo_desp.trans_c,
+                               arch=arch,
+                               shuffle_type=ShuffleStrideType.ShuffleAB,
+                               a_inds_shape=[maxnhot],
+                               b_inds_shape=[maxnhot],
+                               hint=AlgoHint.BackwardWeight.value)
+    workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(
+        m, n, k, profile_res_wgrad.splitk)
+    # Empty placeholders, replaced below only when a workspace is required.
+    # The torch tensor is kept in a local so the memory behind workspace_tv
+    # stays referenced for the duration of the loop.
+    workspace = torch.Tensor()
+
+    workspace_tv = tv.Tensor()
+    if workspace_size > 0:
+        workspace = torch.empty((workspace_size, ),
+                                dtype=torch.int8,
+                                device=features.device)
+        workspace_tv = torch_tensor_to_tv(workspace)
+    # print(workspace_size, m, n, k, profile_res_wgrad.splitk)
+    # torch.cuda.synchronize()
+    # di_time = time.time() - t
+    # t = time.time()
+    # In subm mode din already holds the centre contribution, so the first
+    # GEMM must accumulate (beta = 1) instead of overwrite.
+    inited = subm
+    for i, nhot in enumerate(indice_pair_num_cpu):
+        if subm and i == kv_center:
+            # Already handled by the dense torch.mm calls above.
+            continue
+        if subm and i > kv_center:
+            # Use the count of the mirrored offset; presumably the subm index
+            # kernel only fills counts for the first half -- TODO confirm.
+            nhot = indice_pair_num_cpu[kv - i - 1]
+        if nhot <= 0:
+            continue
+        # NOTE(review): the same beta is passed to both GEMMs below. For the
+        # wgrad call the destination dfilters_tv[i] is a fresh zero slice, so
+        # accumulating or overwriting should give the same result -- confirm.
+        beta = 1.0 if inited else 0.0
+        inp_indices = pair_in[i].slice_first_axis(0, nhot)
+        out_indices = pair_out[i].slice_first_axis(0, nhot)
+        # out.T @ inp, NK @ NC
+        # print(features_tv.shape, out_bp_tv.shape)
+        # dgrad: gather rows of out_bp by out_indices, scatter into din by
+        # inp_indices.
+        GEMM.run_profile(profile_res_dgrad,
+                         out_bp_tv,
+                         filters_tv[i],
+                         din_tv,
+                         False,
+                         True if FILTER_HWIO else False,
+                         False,
+                         arch=arch,
+                         stream=stream,
+                         shuffle_type=ShuffleStrideType.ShuffleAC,
+                         a_inds=out_indices,
+                         c_inds=inp_indices,
+                         hint=AlgoHint.BackwardInput.value,
+                         alpha=1.0,
+                         beta=beta)
+
+        if not FILTER_HWIO:
+            a = out_bp_tv
+            b = features_tv
+            a_inds = out_indices
+            b_inds = inp_indices
+        else:
+            a = features_tv
+            b = out_bp_tv
+            a_inds = inp_indices
+            b_inds = out_indices
+        # wgrad: both operands are gathered; result goes to this offset's
+        # filter-gradient slice.
+        GEMM.run_profile(profile_res_wgrad,
+                         a,
+                         b,
+                         dfilters_tv[i],
+                         True,
+                         False,
+                         False,
+                         arch=arch,
+                         stream=stream,
+                         shuffle_type=ShuffleStrideType.ShuffleAB,
+                         a_inds=a_inds,
+                         b_inds=b_inds,
+                         hint=AlgoHint.BackwardWeight.value,
+                         alpha=1.0,
+                         beta=beta,
+                         workspace=workspace_tv)
+        inited = True
+
+    # torch.cuda.synchronize()
+    # dw_time = time.time() - t
+    # # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape)
+    # # print(dw_time + di_time)
+    # print("BWG", time.time() - t)
+    return (din, dfilters.reshape(filters_shape))
+
+
+def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
+    """Sparse max-pool forward over precomputed index pairs.
+
+    Args:
+        features: input features; the last axis is the channel axis.
+        indice_pairs: index pairs; ``indice_pairs[0][i]`` is read as input
+            indices and ``indice_pairs[1][i]`` as output indices for kernel
+            offset ``i``.
+        indice_pair_num: number of valid pairs per kernel offset.
+        num_activate_out: number of output rows.
+
+    Returns:
+        A ``(num_activate_out, channels)`` tensor, zero-initialised and then
+        updated in place by ``SpconvOps.maxpool_forward``.
+    """
+    # torch.cuda.synchronize()
+    # t = time.time()
+    out_channel = features.shape[-1]
+    out_features = torch.zeros((num_activate_out, out_channel),
+                               dtype=features.dtype,
+                               device=features.device)
+    stream = get_current_stream()
+    # Copies the per-offset counts to the host (device -> host transfer).
+    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    out_features_tv = torch_tensor_to_tv(out_features)
+    features_tv = torch_tensor_to_tv(features)
+    for i, nhot in enumerate(indice_pair_num_cpu):
+        if nhot <= 0:
+            continue
+        # NOTE(review): these are per-iteration torch slices; the conv code
+        # in this file converts indice_pairs to a tv tensor once and slices
+        # that instead, citing torch slicing as slow. Consider aligning.
+        inp_indices = torch_tensor_to_tv(indice_pairs[0][i, :nhot])
+        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
+        SpconvOps.maxpool_forward(out_features_tv, features_tv, out_indices,
+                                  inp_indices, stream)
+    # torch.cuda.synchronize()
+    # print("M", time.time() - t)
+
+    return out_features
+
+
+def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
+                            indice_pair_num):
+    """Sparse max-pool backward over precomputed index pairs.
+
+    Args:
+        features: forward input features.
+        out_features: forward output features.
+        out_bp: gradient w.r.t. ``out_features``.
+        indice_pairs: index pairs; ``indice_pairs[0][i]`` is read as input
+            indices and ``indice_pairs[1][i]`` as output indices for kernel
+            offset ``i``.
+        indice_pair_num: number of valid pairs per kernel offset.
+
+    Returns:
+        ``din``: a tensor shaped like ``features``, zero-initialised and then
+        updated in place by ``SpconvOps.maxpool_backward``.
+    """
+    # NOTE(review): out_channel is assigned but never read.
+    out_channel = features.shape[-1]
+    din = torch.zeros_like(features)
+    stream = get_current_stream()
+    # Copies the per-offset counts to the host (device -> host transfer).
+    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    out_features_tv = torch_tensor_to_tv(out_features)
+    features_tv = torch_tensor_to_tv(features)
+    out_bp_tv = torch_tensor_to_tv(out_bp)
+    din_tv = torch_tensor_to_tv(din)
+    for i, nhot in enumerate(indice_pair_num_cpu):
+        if nhot <= 0:
+            continue
+        inp_indices = torch_tensor_to_tv(indice_pairs[0][i, :nhot])
+        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
+        SpconvOps.maxpool_backward(out_features_tv, features_tv, out_bp_tv,
+                                   din_tv, out_indices, inp_indices, stream)
+
+    return din
+
+
+def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
+    """Placeholder: always raises ``NotImplementedError``."""
+    raise NotImplementedError
+
+
+def pillar_scatter(features, coors, shape):
+    """Placeholder: always raises ``NotImplementedError``."""
+    raise NotImplementedError
--- a/spconv/pool.py
+++ b/spconv/pool.py
-# Copyright 2019 Yan Yan
+# Copyright 2021 Yan Yan
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,10 +21,12 @@ from torch import nn
 from torch.nn import init
 from torch.nn.parameter import Parameter

-import spconv
-import spconv.functional as Fsp
-from spconv import ops
-from spconv.modules import SparseModule
+from spconv import pytorch as spconv
+from spconv.algo import ConvAlgo
+import spconv.pytorch.functional as Fsp
+from spconv.pytorch import ops
+from spconv.pytorch.core import IndiceData
+from spconv.pytorch.modules import SparseModule


 class SparseMaxPool(SparseModule):
@@ -34,8 +36,10 @@ class SparseMaxPool(SparseModule):
                 stride=None,
                 padding=0,
                 dilation=1,
-                 subm=False):
-        super(SparseMaxPool, self).__init__()
+                 indice_key=None,
+                 subm=False,
+                 name=None):
+        super(SparseMaxPool, self).__init__(name=name)
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
        if stride is None:
@@ -46,13 +50,13 @@ class SparseMaxPool(SparseModule):
            padding = [padding] * ndim
        if not isinstance(dilation, (list, tuple)):
            dilation = [dilation] * ndim
-
        self.ndim = ndim
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.subm = subm
        self.dilation = dilation
+        self.indice_key = indice_key

    def forward(self, input):
        assert isinstance(input, spconv.SparseConvTensor)
@@ -67,27 +71,130 @@ class SparseMaxPool(SparseModule):
                self.dilation)
        else:
            out_spatial_shape = spatial_shape
+        out_tensor = input.shadow_copy()
+        if input.benchmark:
+            if self.name is None:
+                raise ValueError(
+                    "you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
+                )
+            if self.name not in input.benchmark_record:
+                input.benchmark_record[self.name] = {
+                    "type": "SparseMaxPool",
+                    "indice_gen_time": [],
+                    "time": [],
+                    "num_points": [],
+                    "num_out_points": [],
+                    "params": {
+                        "kernel_size": self.kernel_size,
+                        "stride": self.stride,
+                        "padding": self.padding,
+                        "dilation": self.dilation,
+                        "channels": features.shape[1],
+                    }
+                }
+
+        if input.benchmark:
+            torch.cuda.synchronize()
+            t = time.time()
+
        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
-            indices, batch_size, spatial_shape, self.kernel_size, self.stride,
-            self.padding, self.dilation, 0, self.subm)
+            indices,
+            batch_size,
+            spatial_shape,
+            ConvAlgo.Native,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            0,
+            False)
+        if input.benchmark:
+            torch.cuda.synchronize()
+            interval = time.time() - t
+            out_tensor.benchmark_record[self.name]["indice_gen_time"].append(
+                interval)
+            t = time.time()
+
+        if self.indice_key is not None:
+            datas = input.find_indice_pair(self.indice_key)
+            if datas is None:
+                indice_data = IndiceData(outids, indices, indice_pairs,
+                                         indice_pairs_num, spatial_shape)
+                input.indice_dict[self.indice_key] = indice_data
+            else:
+                raise ValueError("indice data exists")

        out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
                                          indice_pairs_num.to(device),
                                          outids.shape[0])
-        out_tensor = spconv.SparseConvTensor(out_features, outids,
-                                             out_spatial_shape, batch_size)
-        out_tensor.indice_dict = input.indice_dict
-        out_tensor.grid = input.grid
+        if input.benchmark:
+            torch.cuda.synchronize()
+            interval = time.time() - t
+            out_tensor.benchmark_record[self.name]["time"].append(interval)
+            out_tensor.benchmark_record[self.name]["num_points"].append(
+                features.shape[0])
+            out_tensor.benchmark_record[self.name]["num_out_points"].append(
+                out_features.shape[0])
+
+        out_tensor.features = out_features
+        out_tensor.indices = outids
+        out_tensor.spatial_shape = out_spatial_shape
        return out_tensor


+class SparseMaxPool1d(SparseMaxPool):
+    """Max pooling over sparse tensors with ``ndim`` fixed to 1.
+
+    ``kernel_size``, ``stride``, ``padding``, ``dilation`` and ``name``
+    are handed to ``SparseMaxPool`` unchanged. ``indice_key`` is
+    keyword-only; when given, the base class stores the generated
+    indice data under that key.
+    """
+    def __init__(self, kernel_size, stride=None, padding=0, dilation=1,
+                 name=None, *, indice_key=None):
+        super(SparseMaxPool1d, self).__init__(
+            1, kernel_size, stride, padding, dilation,
+            indice_key=indice_key, name=name)
+
 class SparseMaxPool2d(SparseMaxPool):
-    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
-        super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
-                                              dilation)
+    """Max pooling over sparse tensors with ``ndim`` fixed to 2.
+
+    ``kernel_size``, ``stride``, ``padding``, ``dilation`` and ``name``
+    are handed to ``SparseMaxPool`` unchanged. ``indice_key`` is
+    keyword-only; when given, the base class stores the generated
+    indice data under that key.
+    """
+    def __init__(self, kernel_size, stride=None, padding=0, dilation=1,
+                 name=None, *, indice_key=None):
+        super(SparseMaxPool2d, self).__init__(
+            2, kernel_size, stride, padding, dilation,
+            indice_key=indice_key, name=name)


 class SparseMaxPool3d(SparseMaxPool):
-    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
-        super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
-                                              dilation)
+    """Max pooling over sparse tensors with ``ndim`` fixed to 3.
+
+    ``kernel_size``, ``stride``, ``padding``, ``dilation`` and ``name``
+    are handed to ``SparseMaxPool`` unchanged. ``indice_key`` is
+    keyword-only; when given, the base class stores the generated
+    indice data under that key.
+    """
+    def __init__(self, kernel_size, stride=None, padding=0, dilation=1,
+                 name=None, *, indice_key=None):
+        super(SparseMaxPool3d, self).__init__(
+            3, kernel_size, stride, padding, dilation,
+            indice_key=indice_key, name=name)
+
+class SparseMaxPool4d(SparseMaxPool):
+    """Max pooling over sparse tensors with ``ndim`` fixed to 4.
+
+    ``kernel_size``, ``stride``, ``padding``, ``dilation`` and ``name``
+    are handed to ``SparseMaxPool`` unchanged. ``indice_key`` is
+    keyword-only; when given, the base class stores the generated
+    indice data under that key.
+    """
+    def __init__(self, kernel_size, stride=None, padding=0, dilation=1,
+                 name=None, *, indice_key=None):
+        super(SparseMaxPool4d, self).__init__(
+            4, kernel_size, stride, padding, dilation,
+            indice_key=indice_key, name=name)
--- a/spconv/pytorch/spatial.py
+++ b/spconv/pytorch/spatial.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import time
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import init
+from torch.nn.parameter import Parameter
+
+from spconv import pytorch as spconv
+from spconv.pytorch.modules import SparseModule
+
+
+class RemoveDuplicate(SparseModule):
+    """Keep one row per distinct coordinate of a sparse tensor."""
+
+    def forward(self, x: spconv.SparseConvTensor):
+        """Return a new tensor in which every coordinate appears once.
+
+        Each row of ``x.indices`` is flattened to a single row-major key
+        over ``[x.batch_size, *x.spatial_shape]``; rows sharing a key are
+        duplicates and all but one of them are dropped.
+
+        Args:
+            x: input sparse tensor. Its ``indices`` and ``features`` are
+                only read, never modified.
+
+        Returns:
+            A ``SparseConvTensor`` built from the surviving rows, ordered
+            by ascending flattened key. Which of several duplicate rows
+            survives is unspecified, because a stable sort is not
+            requested.
+        """
+        inds = x.indices
+        dims = [x.batch_size, *x.spatial_shape]
+        # Accumulate the keys in a fresh int64 tensor: adding into a column
+        # view of ``inds`` would overwrite the caller's indices, and a
+        # narrower dtype can overflow once batch_size * prod(spatial_shape)
+        # gets large.
+        keys = torch.zeros(inds.shape[0],
+                           dtype=torch.int64,
+                           device=inds.device)
+        stride = 1
+        for axis in reversed(range(len(dims))):
+            keys += stride * inds[:, axis].to(torch.int64)
+            stride *= dims[axis]
+        # torch.unique yields the distinct values, not row positions, so
+        # sort the keys and keep the first row of every run of equal keys.
+        sorted_keys, order = torch.sort(keys)
+        run_start = torch.ones_like(sorted_keys, dtype=torch.bool)
+        run_start[1:] = sorted_keys[1:] != sorted_keys[:-1]
+        unique_inds = order[run_start]
+        new_inds = inds[unique_inds]
+        new_features = x.features[unique_inds]
+        res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape,
+                                      x.batch_size, x.grid)
+        return res