working on tensor core test

01ed382c · yan.yan · 3517290c · 01ed382c · 01ed382c · 01ed382c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/spconv/core_cc/csrc/sparse/all/ops4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops4d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2Voxel:
+    # Type stub only: declares the attributes and call signatures of the
+    # Point2Voxel binding; every body is a "..." placeholder.
+    # Tensor-typed attributes exposed on instances.
+    hashdata: Tensor
+    point_indice_data: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: voxel size as a list of floats; presumably one value
+                per axis in x, y, z order (from the name) -- TODO confirm.
+            coors_range_xyz: coordinate range as a list of floats; the
+                element layout is not visible in this stub -- TODO confirm.
+            num_point_features: feature count per point (per the name).
+            max_num_voxels: upper bound on the number of voxels (per the
+                name).
+            max_num_points_per_voxel: upper bound on points kept in one
+                voxel (per the name).
+        """
+        ...
+    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffers before conversion -- TODO confirm.
+
+        Returns:
+            A tuple of three tensors; presumably (voxels, indices,
+            num_per_voxel) mirroring the attributes -- TODO confirm.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub only: declares the attributes and call signatures of the
+    # Point2VoxelCPU binding; every body is a "..." placeholder.
+    # Tensor-typed attributes exposed on instances.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: voxel size as a list of floats; presumably one value
+                per axis in x, y, z order (from the name) -- TODO confirm.
+            coors_range_xyz: coordinate range as a list of floats; the
+                element layout is not visible in this stub -- TODO confirm.
+            num_point_features: feature count per point (per the name).
+            max_num_voxels: upper bound on the number of voxels (per the
+                name).
+            max_num_points_per_voxel: upper bound on points kept in one
+                voxel (per the name).
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffers before conversion -- TODO confirm.
+
+        Returns:
+            A tuple of three tensors; presumably (voxels, indices,
+            num_per_voxel) mirroring the attributes -- TODO confirm.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Same signature as point_to_voxel; how the "empty mean" variant
+        differs is not visible in this stub -- TODO confirm.
+
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub only: declares the attributes and call signatures of the
+    # Point2VoxelCPU binding; every body is a "..." placeholder.
+    # Tensor-typed attributes exposed on instances.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: voxel size as a list of floats; presumably one value
+                per axis in x, y, z order (from the name) -- TODO confirm.
+            coors_range_xyz: coordinate range as a list of floats; the
+                element layout is not visible in this stub -- TODO confirm.
+            num_point_features: feature count per point (per the name).
+            max_num_voxels: upper bound on the number of voxels (per the
+                name).
+            max_num_points_per_voxel: upper bound on points kept in one
+                voxel (per the name).
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffers before conversion -- TODO confirm.
+
+        Returns:
+            A tuple of three tensors; presumably (voxels, indices,
+            num_per_voxel) mirroring the attributes -- TODO confirm.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Same signature as point_to_voxel; how the "empty mean" variant
+        differs is not visible in this stub -- TODO confirm.
+
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub only: declares the attributes and call signatures of the
+    # Point2VoxelCPU binding; every body is a "..." placeholder.
+    # Tensor-typed attributes exposed on instances.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: voxel size as a list of floats; presumably one value
+                per axis in x, y, z order (from the name) -- TODO confirm.
+            coors_range_xyz: coordinate range as a list of floats; the
+                element layout is not visible in this stub -- TODO confirm.
+            num_point_features: feature count per point (per the name).
+            max_num_voxels: upper bound on the number of voxels (per the
+                name).
+            max_num_points_per_voxel: upper bound on points kept in one
+                voxel (per the name).
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffers before conversion -- TODO confirm.
+
+        Returns:
+            A tuple of three tensors; presumably (voxels, indices,
+            num_per_voxel) mirroring the attributes -- TODO confirm.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Same signature as point_to_voxel; how the "empty mean" variant
+        differs is not visible in this stub -- TODO confirm.
+
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True.
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+++ b/spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class Point2VoxelCPU:
+    # Type stub only: declares the attributes and call signatures of the
+    # Point2VoxelCPU binding; every body is a "..." placeholder.
+    # Tensor-typed attributes exposed on instances.
+    densehashdata: Tensor
+    voxels: Tensor
+    indices: Tensor
+    num_per_voxel: Tensor
+    @property
+    def grid_size(self) -> List[int]: ...
+    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None: 
+        """
+        Args:
+            vsize_xyz: voxel size as a list of floats; presumably one value
+                per axis in x, y, z order (from the name) -- TODO confirm.
+            coors_range_xyz: coordinate range as a list of floats; the
+                element layout is not visible in this stub -- TODO confirm.
+            num_point_features: feature count per point (per the name).
+            max_num_voxels: upper bound on the number of voxels (per the
+                name).
+            max_num_points_per_voxel: upper bound on points kept in one
+                voxel (per the name).
+        """
+        ...
+    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True; presumably resets the voxel
+                buffers before conversion -- TODO confirm.
+
+        Returns:
+            A tuple of three tensors; presumably (voxels, indices,
+            num_per_voxel) mirroring the attributes -- TODO confirm.
+        """
+        ...
+    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]: 
+        """
+        Same signature as point_to_voxel; how the "empty mean" variant
+        differs is not visible in this stub -- TODO confirm.
+
+        Args:
+            points: input point tensor.
+            clear_voxels: defaults to True.
+        """
+        ...
--- a/spconv/core_cc/cumm/__init__.pyi
+++ b/spconv/core_cc/cumm/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/core_cc/cumm/gemm/__init__.pyi
+++ b/spconv/core_cc/cumm/gemm/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/core_cc/cumm/gemm/gather.pyi
+++ b/spconv/core_cc/cumm/gemm/gather.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class ScatterAll:
+    # Type stub only: signatures of the ScatterAll binding; bodies are "...".
+    def __init__(self) -> None: ...
+    # Returns a list of 4-int tuples; presumably (tile_m, tile_k_bytes,
+    # bytes_per_access, num_threads) to match scatter() -- TODO confirm.
+    @staticmethod
+    def get_all_scatter_params() -> List[Tuple[int, int, int, int]]: ...
+    def supported_scatter(self, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, channel_size: int, dtype: int) -> bool: 
+        """
+        Report whether a scatter configuration is supported (per the name
+        and bool return).
+
+        Args:
+            tile_m: 
+            tile_k_bytes: 
+            bytes_per_access: 
+            num_threads: 
+            channel_size: 
+            dtype: integer dtype code; the encoding is not visible here.
+        """
+        ...
+    @staticmethod
+    def stream_synchronize(stream: int = 0) -> None: 
+        """
+        Args:
+            stream: stream handle passed as an int, defaults to 0.
+        """
+        ...
+    def scatter(self, output: Tensor, input: Tensor, indices: Tensor, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, stream: int = 0) -> None: 
+        """
+        Args:
+            output: 
+            input: 
+            indices: 
+            tile_m: 
+            tile_k_bytes: 
+            bytes_per_access: 
+            num_threads: 
+            stream: stream handle passed as an int, defaults to 0.
+        """
+        ...
+    def scatter2(self, output: Tensor, input: Tensor, indices: Tensor, size: int, stream: int = 0) -> None: 
+        """
+        Variant of scatter that takes a single ``size`` instead of explicit
+        tile parameters; how ``size`` is used is not visible in this stub.
+
+        Args:
+            output: 
+            input: 
+            indices: 
+            size: 
+            stream: stream handle passed as an int, defaults to 0.
+        """
+        ...
+class GatherAll:
+    # Type stub only: signatures of the GatherAll binding; bodies are "...".
+    def __init__(self) -> None: ...
+    # Returns a list of 4-int tuples; presumably (tile_m, tile_k_bytes,
+    # bytes_per_access, num_threads) to match gather() -- TODO confirm.
+    @staticmethod
+    def get_all_gather_params() -> List[Tuple[int, int, int, int]]: ...
+    @staticmethod
+    def supported(bytes_per_access: int, channel_size: int, dtype: int) -> bool: 
+        """
+        Report whether a gather configuration is supported (per the name
+        and bool return).
+
+        Args:
+            bytes_per_access: 
+            channel_size: 
+            dtype: integer dtype code; the encoding is not visible here.
+        """
+        ...
+    @staticmethod
+    def stream_synchronize(stream: int = 0) -> None: 
+        """
+        Args:
+            stream: stream handle passed as an int, defaults to 0.
+        """
+        ...
+    def gather(self, output: Tensor, input: Tensor, indices: Tensor, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, stream: int = 0) -> None: 
+        """
+        Args:
+            output: 
+            input: 
+            indices: 
+            tile_m: 
+            tile_k_bytes: 
+            bytes_per_access: 
+            num_threads: 
+            stream: stream handle passed as an int, defaults to 0.
+        """
+        ...
+    def gather2(self, output: Tensor, input: Tensor, indices: Tensor, size: int, stream: int = 0) -> None: 
+        """
+        Variant of gather that takes a single ``size`` instead of explicit
+        tile parameters; how ``size`` is used is not visible in this stub.
+
+        Args:
+            output: 
+            input: 
+            indices: 
+            size: 
+            stream: stream handle passed as an int, defaults to 0.
+        """
+        ...
--- a/spconv/core_cc/cumm/gemm/main.pyi
+++ b/spconv/core_cc/cumm/gemm/main.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class GemmAlgoDesp:
+    # Type stub only: fields and signatures of the GemmAlgoDesp binding, a
+    # description of one GEMM algorithm configuration. Bodies are "...".
+    # dtype_* / dacc / dcomp are integer dtype codes; the encoding is not
+    # visible in this stub.
+    dtype_a: int
+    dtype_b: int
+    dtype_c: int
+    tile_shape: Tuple[int, int, int]
+    warp_tile_shape: Tuple[int, int, int]
+    num_stage: int
+    dacc: int
+    dcomp: int
+    algo: str
+    tensorop: List[int]
+    # Integer fields with a trailing underscore; bool properties of the same
+    # base name are declared below. How they relate is not visible here.
+    split_k_serial_: int
+    split_k_parallel_: int
+    shuffle_type: str
+    element_per_access_a: int
+    element_per_access_b: int
+    element_per_access_c: int
+    def __init__(self) -> None: ...
+    def __repr__(self) -> str: ...
+    @property
+    def split_k_serial(self) -> bool: ...
+    @split_k_serial.setter
+    def split_k_serial(self, val: bool) -> None: 
+        """
+        Args:
+            val: new value for the split_k_serial flag.
+        """
+        ...
+    @property
+    def split_k_parallel(self) -> bool: ...
+    @split_k_parallel.setter
+    def split_k_parallel(self, val: bool) -> None: 
+        """
+        Args:
+            val: new value for the split_k_parallel flag.
+        """
+        ...
+    def check_valid(self) -> None: ...
+    @property
+    def trans_a(self) -> bool: ...
+    @trans_a.setter
+    def trans_a(self, val: bool) -> None: 
+        """
+        Args:
+            val: new value for the trans_a flag.
+        """
+        ...
+    @property
+    def trans_b(self) -> bool: ...
+    @trans_b.setter
+    def trans_b(self, val: bool) -> None: 
+        """
+        Args:
+            val: new value for the trans_b flag.
+        """
+        ...
+    @property
+    def trans_c(self) -> bool: ...
+    @trans_c.setter
+    def trans_c(self, val: bool) -> None: 
+        """
+        Args:
+            val: new value for the trans_c flag.
+        """
+        ...
+    def query_workspace_size(self, m: int, n: int, k: int, split_k_slices: int) -> int: 
+        """
+        Return a workspace size as an int (unit not visible in this stub)
+        for the given problem size and split-k slice count.
+
+        Args:
+            m: 
+            n: 
+            k: 
+            split_k_slices: 
+        """
+        ...
+    def supported(self, m: int, n: int, k: int) -> bool: 
+        """
+        Report whether this description supports the given m/n/k (per the
+        name and bool return).
+
+        Args:
+            m: 
+            n: 
+            k: 
+        """
+        ...
+    def supported_ldx(self, lda: int, ldb: int, ldc: int) -> bool: 
+        """
+        Report whether the given lda/ldb/ldc values are supported (per the
+        name and bool return).
+
+        Args:
+            lda: 
+            ldb: 
+            ldc: 
+        """
+        ...
+class GemmParams:
+    # Type stub only: fields and signatures of the GemmParams binding, the
+    # argument bundle accepted by GemmMainUnitTest.matmul2. Bodies are "...".
+    algo_desp: GemmAlgoDesp
+    split_k_slices: int
+    # These four are declared with an empty Tensor() as their stub default.
+    workspace: Tensor =  Tensor()
+    a_inds: Tensor =  Tensor()
+    b_inds: Tensor =  Tensor()
+    c_inds: Tensor =  Tensor()
+    alpha: float
+    beta: float
+    # Stream handle passed as an int.
+    stream: int
+    def __init__(self) -> None: ...
+    def check_valid(self) -> None: ...
+    # a / b / c are exposed as Tensor properties with setters rather than as
+    # plain fields.
+    @property
+    def a(self) -> Tensor: ...
+    @a.setter
+    def a(self, val: Tensor) -> None: 
+        """
+        Args:
+            val: tensor to store as ``a``.
+        """
+        ...
+    @property
+    def b(self) -> Tensor: ...
+    @b.setter
+    def b(self, val: Tensor) -> None: 
+        """
+        Args:
+            val: tensor to store as ``b``.
+        """
+        ...
+    @property
+    def c(self) -> Tensor: ...
+    @c.setter
+    def c(self, val: Tensor) -> None: 
+        """
+        Args:
+            val: tensor to store as ``c``.
+        """
+        ...
+class GemmMainUnitTest:
+    # Type stub only: every member is a static method of the GemmMainUnitTest
+    # binding; bodies are "...".
+    @staticmethod
+    def get_all_algo_desp() -> List[GemmAlgoDesp]: ...
+    @staticmethod
+    def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: str = "NS", a_inds_shape: List[int] =  [], b_inds_shape: List[int] =  [], c_inds_shape: List[int] =  []) -> Tuple[int, int, int]: 
+        """
+        Return a 3-int tuple; presumably (m, n, k) per the name -- TODO
+        confirm.
+
+        Args:
+            a_shape: 
+            b_shape: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            shuffle_type: defaults to "NS"; other accepted values are not
+                visible in this stub.
+            a_inds_shape: defaults to an empty list.
+            b_inds_shape: defaults to an empty list.
+            c_inds_shape: defaults to an empty list.
+        """
+        ...
+    @staticmethod
+    def align_to_power2(val: int) -> int: 
+        """
+        Args:
+            val: 
+        """
+        ...
+    @staticmethod
+    def device_synchronize() -> None: ...
+    @staticmethod
+    def stream_synchronize(stream: int) -> None: 
+        """
+        Args:
+            stream: stream handle passed as an int (no default here).
+        """
+        ...
+    @staticmethod
+    def simple_select_tile_shape(m: int, n: int, k: int, tile_ms: List[int], tile_ns: List[int], tile_ks: List[int], tile_shape_to_algos: Dict[int, List[int]], large_k_first: bool) -> List[int]: 
+        """
+        Return a list of ints; whether these are algorithm indices or a
+        tile shape is not visible in this stub -- TODO confirm.
+
+        Args:
+            m: 
+            n: 
+            k: 
+            tile_ms: 
+            tile_ns: 
+            tile_ks: 
+            tile_shape_to_algos: mapping from an int key to a list of ints.
+            large_k_first: 
+        """
+        ...
+    @staticmethod
+    def matmul2(params: GemmParams) -> None: 
+        """
+        Args:
+            params: GemmParams bundle describing the operation.
+        """
+        ...
--- a/spconv/csrc/__init__.py
+++ b/spconv/csrc/__init__.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/csrc/sparse/__init__.py
+++ b/spconv/csrc/sparse/__init__.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cumm.common import TensorViewKernel, ThrustLib
+from cumm.conv.bases import ConvOpType, NHWC
+from cumm.conv.params import ConvProblem
+from cumm import dtypes
+import pccm 
+
+from .pointops import Point2Voxel, Point2VoxelCPU
+from .indices import SparseConvIndicesKernel, CudaCommonKernel
+from .maxpool import IndiceMaxPool
+
+class SpconvOps(pccm.Class):
+    # pccm class describing the SpconvOps static functions. Each decorated
+    # method below builds a pccm.FunctionCode (argument declarations plus a raw
+    # C++ body) instead of doing the computation in Python.
+    def __init__(self):
+        super().__init__()
+        # Dimensionalities for which per-ndim classes are registered here and
+        # for which dispatch branches are emitted by the generate_* methods.
+        self.ndims = [1, 2, 3, 4]
+        for ndim in self.ndims:
+            # float32 point-to-voxel converters, registered under per-ndim
+            # namespaces with per-ndim aliases.
+            p2v = Point2Voxel(dtypes.float32,  ndim)
+            p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
+            self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
+            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU")
+
+            # Index-generation kernel for a forward conv problem with NHWC
+            # layouts and int32 indices.
+            problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
+            indices = SparseConvIndicesKernel(problem, dtypes.int32)
+            # self.add_param_class("ops", indices, "SpconvIndices")
+            # Functions that are given access to the SpconvIndices{ndim}D alias.
+            # The maxpool_* methods are not listed; they use add_dependency.
+            cuda_funcs = [self.generate_conv_inds, self.generate_subm_conv_inds, 
+                self.generate_conv_inds_stage1, self.generate_conv_inds_stage2, self.sort_1d_by_key]
+            self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")
+
+
+    # Emits a function returning int that derives ndim from the indices tensor
+    # (indices.dim(1) - 1), asserts every vector argument has that length, then
+    # copies the vectors into fixed-size tv::array values and forwards to
+    # SpconvIndices{ndim}D::generate_conv_inds. Throws for an unlisted ndim.
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_conv_inds(self):
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"std::vector<int>")
+        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.raw(f"""
+        int ndim = indices.dim(1) - 1;
+        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
+            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
+            padding.size() == ndim, "your params size not equal to ndim", ndim);
+        """)
+        # One C++ branch per supported ndim; the Python loop variable is
+        # substituted into the template, the C++ `ndim` is the runtime value.
+        for ndim in self.ndims:
+            code.raw(f"""
+            if (ndim == {ndim}){{
+                tv::array<int, {ndim}> output_dims_, input_dims_;
+                tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
+                for (int i = 0; i < {ndim}; ++i){{
+                    output_dims_[i] = output_dims[i];
+                    input_dims_[i] = input_dims[i];
+                    ksize_[i] = ksize[i];
+                    stride_[i] = stride[i];
+                    padding_[i] = padding[i];
+                    dilation_[i] = dilation[i];
+                }}
+                return SpconvIndices{ndim}D::generate_conv_inds(indices, hashdata,
+                    indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc,
+                    batch_size, output_dims_, input_dims_, 
+                    ksize_, stride_, padding_, dilation_);
+            }}
+            """)
+        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+        return code.ret("int")
+
+    # Same dispatch shape as generate_conv_inds, forwarding to
+    # SpconvIndices{ndim}D::generate_conv_inds_stage1 (no hashdata/out_inds).
+    # NOTE(review): stream_int is declared as an argument but is not passed in
+    # the emitted call, whereas generate_subm_conv_inds does forward it --
+    # confirm whether this is intentional.
+    # NOTE(review): pyanno is "int" here but "int = 0" in
+    # generate_subm_conv_inds -- confirm which spelling pccm expects.
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage1(self):
+        code = pccm.FunctionCode()
+        code.arg("indices", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"std::vector<int>")
+        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
+        code.raw(f"""
+        int ndim = indices.dim(1) - 1;
+        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
+            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
+            padding.size() == ndim, "your params size not equal to ndim", ndim);
+        """)
+
+        for ndim in self.ndims:
+            code.raw(f"""
+            if (ndim == {ndim}){{
+                tv::array<int, {ndim}> output_dims_, input_dims_;
+                tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
+                for (int i = 0; i < {ndim}; ++i){{
+                    output_dims_[i] = output_dims[i];
+                    input_dims_[i] = input_dims[i];
+                    ksize_[i] = ksize[i];
+                    stride_[i] = stride[i];
+                    padding_[i] = padding[i];
+                    dilation_[i] = dilation[i];
+                }}
+                return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
+                    indice_pairs, indice_pairs_uniq, indice_num_per_loc,
+                    batch_size, output_dims_, input_dims_, 
+                    ksize_, stride_, padding_, dilation_);
+            }}
+            """)
+        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+
+        return code.ret("int")
+
+    # Same dispatch shape again, forwarding to
+    # SpconvIndices{ndim}D::generate_conv_inds_stage2 with num_out_act added.
+    # NOTE(review): stream_int is declared but not passed in the emitted call
+    # -- confirm whether this is intentional.
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage2(self):
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg("num_out_act", "int")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"std::vector<int>")
+        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
+        code.raw(f"""
+        int ndim = indices.dim(1) - 1;
+        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
+            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
+            padding.size() == ndim, "your params size not equal to ndim", ndim);
+        """)
+
+        for ndim in self.ndims:
+            code.raw(f"""
+            if (ndim == {ndim}){{
+                tv::array<int, {ndim}> output_dims_, input_dims_;
+                tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
+                for (int i = 0; i < {ndim}; ++i){{
+                    output_dims_[i] = output_dims[i];
+                    input_dims_[i] = input_dims[i];
+                    ksize_[i] = ksize[i];
+                    stride_[i] = stride[i];
+                    padding_[i] = padding[i];
+                    dilation_[i] = dilation[i];
+                }}
+                return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata,
+                    indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
+                    batch_size, output_dims_, input_dims_, 
+                    ksize_, stride_, padding_, dilation_);
+            }}
+            """)
+        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+
+        return code.ret("int")
+
+    # Submanifold variant: only input_dims, ksize and dilation are size-checked
+    # and converted; indice_pair_mask, backward and stream_int are forwarded
+    # unchanged to SpconvIndices{ndim}D::generate_subm_conv_inds.
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def generate_subm_conv_inds(self):
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("input_dims", f"std::vector<int>")
+        code.arg("ksize, dilation", f"std::vector<int>")
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("backward", "bool", "false")
+        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
+        code.raw(f"""
+        int ndim = indices.dim(1) - 1;
+        TV_ASSERT_RT_ERR(input_dims.size() == ndim &&
+            ksize.size() == ndim && dilation.size() == ndim, "your params size not equal to ndim", ndim);
+        """)
+        for ndim in self.ndims:
+            code.raw(f"""
+            if (ndim == {ndim}){{
+                tv::array<int, {ndim}> input_dims_;
+                tv::array<int, {ndim}> ksize_, dilation_;
+                for (int i = 0; i < {ndim}; ++i){{
+                    input_dims_[i] = input_dims[i];
+                    ksize_[i] = ksize[i];
+                    dilation_[i] = dilation[i];
+                }}
+                return SpconvIndices{ndim}D::generate_subm_conv_inds(indices, hashdata,
+                    indice_pairs, out_inds, indice_num_per_loc,
+                    batch_size, input_dims_, 
+                    ksize_, dilation_, indice_pair_mask, backward,
+                    stream_int);
+            }}
+            """)
+        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
+        return code.ret("int")
+
+    # Thin wrapper forwarding all arguments to IndiceMaxPool::forward.
+    # NOTE(review): unlike the generate_* methods this returns `code` without
+    # .ret(...), while the raw body uses `return` -- presumably the callee
+    # returns void; confirm.
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def maxpool_forward(self):
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("inp", "tv::Tensor")
+        code.arg("out_inds", "tv::Tensor")
+        code.arg("in_inds", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
+        code.add_dependency(IndiceMaxPool)
+        code.raw(f"""
+        return IndiceMaxPool::forward(out, inp, out_inds, in_inds, stream);
+        """)
+        return code
+
+    # Thin wrapper forwarding all arguments to IndiceMaxPool::backward; the
+    # same return-type note as maxpool_forward applies.
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def maxpool_backward(self):
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("inp", "tv::Tensor")
+        code.arg("dout", "tv::Tensor")
+        code.arg("dinp", "tv::Tensor")
+        code.arg("out_inds", "tv::Tensor")
+        code.arg("in_inds", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
+        code.add_dependency(IndiceMaxPool)
+        code.raw(f"""
+        return IndiceMaxPool::backward(out, inp, dout, dinp, out_inds, in_inds, stream);
+        """)
+        return code
+
+    # Emits a function that allocates an int32 tensor with data.dim(0)
+    # elements, launches cudakers::arange_kernel on it, then calls
+    # thrust::sort_by_key with `data` as the keys and that tensor as the
+    # values, and returns the tensor. `data` itself is sorted in place. The
+    # dispatch covers int32/uint32/int64/uint64 keys, and the thrust policy is
+    # fixed to stream 0 (there is no stream argument).
+    # Presumably arange_kernel fills 0..n-1 so the result is the sorting
+    # permutation -- confirm against CudaCommonKernel.
+    @pccm.pybind.mark
+    @pccm.cuda.static_function
+    def sort_1d_by_key(self):
+        code = pccm.FunctionCode()
+        code.add_dependency(ThrustLib, TensorViewKernel)
+        code.add_param_class("cudakers", CudaCommonKernel())
+        code.arg("data", "tv::Tensor")
+        code.raw(f"""
+        tv::Tensor indices({{data.dim(0)}}, tv::int32, 0);
+        tv::cuda::Launch launcher(data.dim(0));
+        launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0));
+        tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            thrust::device_ptr<T> ptr_tr(data.data_ptr<T>());
+            thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>());
+            auto thrust_ctx = thrust::cuda::par.on(0);
+            thrust::sort_by_key(thrust_ctx, ptr_tr, ptr_tr + data.dim(0), ptr_k);
+        }});
+        return indices;
+        """)
+        return code.ret("tv::Tensor")
--- a/spconv/csrc/sparse/devleop/sort_bench.py
+++ b/spconv/csrc/sparse/devleop/sort_bench.py
+import torch 
+import time 
+
+def main():
+
+    arr = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()
+    arr2 = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()
+
+    torch.cuda.synchronize()
+    ar = torch.arange(arr.shape[0]).cuda()
+
+    t = time.time()
+    for i in range(10):
+
+        xx, indices = arr.sort()
+        # thh = torch.empty_like(indices)
+        xx2, indices2 = arr2.sort()
+
+        # thh[indices] = ar
+        torch.cuda.synchronize()
+        print(time.time() - t)
+        t = time.time()
+    # print(indices[:10], thh[:10])
+    a = torch.rand(130000, 27 * 32).cuda().float()
+    b = torch.rand(27 * 32, 32).cuda().float()
+    c = torch.rand(130000, 32).cuda().float()
+    for i in range(10):
+        torch.cuda.synchronize()
+        t = time.time()
+        torch.mm(a, b, out=c)
+        # thh[indices] = ar
+        torch.cuda.synchronize()
+        print(time.time() - t)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/spconv/csrc/sparse/devleop/wtf.py
+++ b/spconv/csrc/sparse/devleop/wtf.py
+#!/home/yy/library/anaconda3/bin/python
+import sys
+from pathlib import Path 
+import ctypes
+# _cudart = ctypes.CDLL('libcudart.so')
+
+print(str(Path(__file__).parent.parent.parent.parent))
+sys.path.append(str(Path(__file__).parent.parent.parent.parent))
+
+
+
+from spconv import tensorview as tv 
+
+from spconv.sparse import build
+import numpy as np 
+from pathlib import Path 
+from spconv.spconv_ops_cc.sparse.all.ops import Point2Voxel
+from spconv.spconv_ops_cc.sparse.all import SpconvOps
+
+import time 
+
+def main():
+    # Scratch driver: loads a point cloud from a hard-coded local path,
+    # voxelizes it on the GPU with Point2Voxel, prints timings and shapes, and
+    # allocates the buffers that the commented-out index-generation experiment
+    # at the bottom would use.
+    # NOTE(review): the absolute path below only exists on the author's machine.
+    data = np.load("/home/yy/OneDrive/dev/spconv/test/data/benchmark-pc.npz")["pc"].astype(np.float32)
+    print(data.shape, data.dtype)
+    p2v = Point2Voxel([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3, 150000, 1)
+    gs = p2v.grid_size # zyx
+    print(gs)
+    # return
+    data_tv = tv.from_numpy(data).cuda()
+    # Time six consecutive voxelization calls.
+    # NOTE(review): no explicit device synchronization around the timer; if
+    # point_to_voxel_hash is asynchronous these numbers only cover the launch
+    # -- confirm.
+    for i in range(6):
+        t = time.time()
+
+        voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)   
+        
+        print(time.time() - t)
+    voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)   
+    print(voxels.shape, gs)
+    # gs_xyz is only referenced by commented-out code below.
+    gs_xyz = gs
+    indices_np = indices.cpu().numpy()
+    # indices_offset = indices_np[:, 0] * gs_xyz[1] * gs_xyz[2] + indices_np[:, 1] * gs_xyz[2] + indices_np[:, 2]
+    # uq = np.unique(indices_offset)
+    # print(uq.shape, indices_offset.shape, gs_xyz)
+    # return 
+    # 3x3x3 kernel; kv is the kernel volume (27).
+    ksize = [3] * 3 
+    kv = int(np.prod(ksize))
+    # Prepend a zero column to the voxel indices; presumably column 0 is the
+    # batch index (from the "_with_bs" name) -- confirm.
+    indices_with_bs = np.zeros((indices_np.shape[0], 4), dtype=np.int32)
+    indices_with_bs[:, 1:] = indices_np
+    print(indices_with_bs.mean(), indices_with_bs.max(), indices_with_bs.min())
+
+    # Buffers below are allocated but only consumed by the commented-out
+    # experiment at the end of this function.
+    indices = tv.from_numpy(indices_with_bs).cuda()
+    out_indices = tv.zeros([indices.dim(0) * kv, 4], tv.int32, 0)
+    indice_num_per_loc = tv.zeros([kv], tv.int32, 0)
+
+
+    points = voxels.view([-1, 3])
+    hashdata = tv.zeros([points.dim(0) * kv * 2], tv.custom64, 0)
+    hashdata_subm = tv.zeros([points.dim(0) * 2], tv.custom64, 0)
+
+    indice_pairs = tv.full([2, kv, indices.dim(0)], -1, tv.int32, 0)
+    indice_pairs_uniq = tv.zeros([indice_pairs.size // 2 + 1], tv.int32, 0)
+
+    # Disabled experiment calling SpconvOps.generate_conv_inds and
+    # SpconvOps.generate_subm_conv_inds on shuffled indices:
+    # for i in range(10):
+    #     indice_pairs.fill_int_(-1)
+    #     np.random.shuffle(indices_with_bs)
+    #     indices = tv.from_numpy(indices_with_bs).cuda()
+
+    #     indice_num_per_loc.zero_()
+    #     out_act = SpconvOps.generate_conv_inds(indices, hashdata, indice_pairs,
+    #         indice_pairs_uniq, out_indices, indice_num_per_loc, 
+    #         1, gs, gs, [3, 3, 3], [1, 1, 1], [1, 1, 1], [1, 1, 1])
+    #     indice_num_per_loc.zero_()
+    #     out_act = SpconvOps.generate_subm_conv_inds(indices, hashdata_subm, indice_pairs,
+    #         out_indices, indice_num_per_loc, 
+    #         1, gs, ksize, [1, 1, 1])
+    #     indice_num_per_loc_cpu = indice_num_per_loc.cpu().numpy()
+    #     indice_pairs_cpu = indice_pairs.cpu().numpy()
+    #     indice_pairs_cpu_flat = indice_pairs_cpu.reshape(-1)
+    #     uq, count = np.unique(indice_pairs_cpu_flat, return_counts=True)
+    #     print(out_act, indice_pairs_cpu.shape, indice_pairs_cpu.mean(), indice_num_per_loc_cpu.tolist())
+    #     print(indice_pairs_cpu[:, 13, :2])
+    #     print(uq, count)
+
+# Script entry point: call main() only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+
+    main()
\ No newline at end of file
--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from cumm.conv.bases import ConvEnum
+from cumm.gemm.core.metaarray import MetaArray, seq
+from cumm import dtypes
+import pccm 
+from cumm.gemm.layout import TensorGeneric, to_stride
+from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib
+from cumm.gemm import codeops
+from typing import List 
+from cumm.conv.params import ConvProblem
+import numpy as np 
+
+class CudaCommonKernel(pccm.ParameterizedClass):
+    # Code generator for two small templated CUDA kernels (arange_kernel and
+    # fill_kernel). Each method returns a pccm.FunctionCode holding the
+    # kernel's template parameter, its arguments and its C++ body.
+    # we need to use PClass instead of Class
+    # because cuda global function can't be put in class body.
+    @pccm.cuda.cuda_global_function
+    def arange_kernel(self):
+        # Kernel templated on T: for every index i yielded by
+        # tv::KernelLoopX<int>(size), stores T(i) into data[i].
+        # (KernelLoopX is presumably a grid-stride loop over [0, size) --
+        # confirm against the tensorview headers.)
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("data", f"T*") 
+        code.arg("size", f"int") 
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(size)) {{
+            data[i] = T(i);
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def fill_kernel(self):
+        # Kernel templated on T: for every index i yielded by
+        # tv::KernelLoopX<int>(size), stores T(val) into data[i].
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("data", f"T*") 
+        code.arg("val", f"T")
+        code.arg("size", f"int") 
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(size)) {{
+            data[i] = T(val);
+        }}
+        """)
+        return code
+
+
+class ConvOutLocIter(pccm.ParameterizedClass):
+    # Code generator for a host/device C++ helper that converts between
+    # input coordinates ("nhw") and output coordinates ("npq") of a
+    # ConvProblem for one kernel (filter) position.
+    #
+    # Coordinates are int arrays of length ndim + 1: element 0 is compared
+    # against problem_.N, elements 1..ndim are the per-dimension positions.
+    # The current kernel position is kept in the member count_ (one entry
+    # per spatial dimension).
+    # TODO add conv transpose
+    def __init__(self, problem: ConvProblem):
+        # Registers the dependencies, parameter classes and C++ members of
+        # the generated class.
+        super().__init__()
+        self.add_dependency(TensorView)
+        self.add_param_class("lociter", problem, "ConvProblem")
+        # Rank ndim + 1 layout for coordinates, rank ndim layout for kernel
+        # positions. (Meaning of the second TensorGeneric argument is not
+        # visible here.)
+        layout_npq = TensorGeneric(problem.ndim + 1, False)
+        layout_rs = TensorGeneric(problem.ndim, False)
+
+        self.add_param_class("lociter", layout_npq, "LayoutNPQ")
+        self.add_param_class("lociter_rs", layout_rs, "LayoutRS")
+
+        self.ndim = problem.ndim 
+        self.add_member("problem_", f"ConvProblem")
+        self.add_member("count_", f"tv::array<int, {self.ndim}>")
+        self.add_member("layout_npq", f"LayoutNPQ")
+        self.add_member("layout_rs", f"LayoutRS")
+
+    @pccm.cuda.constructor(host=True, device=True, forceinline=True)
+    def ctor(self):
+        # Constructor: stores the problem, zero-initializes count_ and
+        # builds layout_npq from {N, output_dims...} and layout_rs from
+        # ksize.
+        code = pccm.FunctionCode()
+        code.arg("problem", f"ConvProblem const&")
+        code.ctor_init("problem_", f"problem")
+        zeros = ", ".join(["0"] * self.ndim)
+        code.ctor_init("count_", f"{{{zeros}}}")
+        # presumably expands to "problem.output_dims[0], ..., [ndim - 1]"
+        # (same for ksize) -- confirm against codeops.unpack.
+        pqs = codeops.unpack("problem.output_dims", range(self.ndim))
+        rss = codeops.unpack("problem.ksize", range(self.ndim))
+
+        code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
+        code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})")
+
+        return code 
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               name="operator++")
+    def increment(self):
+        # operator++: advances count_ to the next kernel position. The last
+        # dimension is incremented first; a dimension that reaches its ksize
+        # is reset to 0 and the carry moves to the previous dimension.
+        # After the final position every entry is back at 0.
+        code = pccm.FunctionCode()
+        for i in range(self.ndim - 1, -1, -1):
+            code.raw(f"""
+            if (++count_[{i}] < problem_.ksize[{i}]){{
+                return *this;
+            }}
+            count_[{i}] = 0;
+            """)
+        code.raw("return *this;")
+        return code.ret(f"{self.class_name}&")
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True)
+    def set_filter_offset(self):
+        # Sets count_ from a linear kernel offset through layout_rs.inverse
+        # (presumably the per-dimension decomposition of filter_offset).
+        code = pccm.FunctionCode()
+        code.arg("filter_offset", "int")
+        code.raw(f"""
+        layout_rs.inverse(filter_offset, count_);
+        """)
+        return code
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def nhw_to_npq(self):
+        # Maps an input coordinate to the output coordinate for the current
+        # kernel position, per dimension:
+        #   (in + padding - count_ * dilation) / stride
+        # With the template flag NoStride the division uses 1 instead of
+        # the stride. No range or divisibility check happens here; the
+        # division is plain C++ integer division.
+        code = pccm.FunctionCode()
+        code.arg("nhw_offset", "const int*")
+        code.nontype_targ("NoStride", "bool")
+        for i in range(self.ndim):
+            code.raw(f"""
+            int r_{i} = count_[{i}];
+            int h_{i} = (nhw_offset[{i + 1}] + problem_.padding[{i}] - 
+                r_{i} * problem_.dilation[{i}]) / (NoStride ? 1 : problem_.stride[{i}]);
+            """)
+        # presumably "h_0, h_1, ..." -- confirm against codeops.unpack_str.
+        h0h1h2 = codeops.unpack_str("h", range(self.ndim))
+        code.raw(f"""
+        return {{nhw_offset[0], {h0h1h2}}};
+        """)
+        return code.ret(f"tv::array<int, {self.ndim + 1}>")
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def npq_to_nhw(self):
+        # Maps an output coordinate to the input coordinate for the current
+        # kernel position, per dimension:
+        #   out * stride - padding + count_ * dilation
+        # Element 0 is copied through unchanged. No range check here.
+        code = pccm.FunctionCode()
+        code.arg("npq_offset", "const int*")
+        for i in range(self.ndim):
+            code.raw(f"""
+            int r_{i} = count_[{i}];
+            int h_{i} = npq_offset[{i + 1}] * problem_.stride[{i}] - problem_.padding[{i}] + r_{i} * problem_.dilation[{i}];
+            """)
+        h0h1h2 = codeops.unpack_str("h", range(self.ndim))
+        code.raw(f"""
+        return {{npq_offset[0], {h0h1h2}}};
+        """)
+        return code.ret(f"tv::array<int, {self.ndim + 1}>")
+
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_npq(self):
+        # Writes the output coordinate for nhw_offset into npq_offset and
+        # returns whether it is a valid output location. The result is true
+        # only when element 0 is below problem_.N, every spatial coordinate
+        # lies in [0, output_dims) and every un-strided coordinate is an
+        # exact multiple of the stride.
+        code = pccm.FunctionCode()
+        code.arg("nhw_offset", "const int*")
+        code.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
+        code.ret("bool")
+        code.raw(f"""
+        auto npq_no_stride = nhw_to_npq<true>(nhw_offset);
+        npq_offset[0] = npq_no_stride[0];
+        """)
+        hw_valid = [] # type: List[str]
+        stride_valid = [] # type: List[str]
+        for i in range(self.ndim):
+            code.raw(f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];")
+            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
+                            f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
+            # the remainder test also rejects negative values that integer
+            # division would truncate to 0
+            stride_valid.append(f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
+        code.raw(f"""
+        return npq_no_stride[0] < problem_.N && 
+            {' && '.join(hw_valid)} &&
+            {' && '.join(stride_valid)};
+        """)
+        return code 
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_npq_no_stride(self):
+        # Same as query_npq but without the stride division and without the
+        # divisibility test: npq_offset is the result of nhw_to_npq<true>
+        # and only the range checks are applied.
+        code = pccm.FunctionCode()
+        code.arg("nhw_offset", "const int*")
+        code.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
+        code.ret("bool")
+        code.raw(f"""
+        npq_offset = nhw_to_npq<true>(nhw_offset);
+        """)
+        hw_valid = [] # type: List[str]
+        for i in range(self.ndim):
+            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
+                            f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
+        code.raw(f"""
+        return npq_offset[0] < problem_.N && 
+            {' && '.join(hw_valid)};
+        """)
+        return code 
+
+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_nhw(self):
+        # Writes the input coordinate for npq_offset into nhw_offset (via
+        # npq_to_nhw) and returns whether element 0 is below problem_.N and
+        # every spatial coordinate lies in [0, input_dims).
+        code = pccm.FunctionCode()
+        code.arg("npq_offset", "const int*")
+        code.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
+        code.ret("bool")
+        code.raw(f"""
+        nhw_offset = npq_to_nhw(npq_offset);
+        """)
+        hw_valid = [] # type: List[str]
+        for i in range(self.ndim):
+            hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
+                            f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
+        code.raw(f"""
+        return nhw_offset[0] < problem_.N && 
+            {' && '.join(hw_valid)};
+        """)
+        return code 
+
+class SparseConvIndicesKernel(pccm.ParameterizedClass):
+    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
+        # Registers the dependencies and parameter classes used by the
+        # generated kernels and records the index dtype.
+        #
+        # problem: conv problem description; its ndim fixes the coordinate
+        #     width (ndim + 1) used throughout the kernels.
+        # dtype_indices: element type of the indice-pair buffers; must be
+        #     int32 or int64 (asserted below).
+        super().__init__()
+        self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel, ThrustLib)
+        self.loc_iter = ConvOutLocIter(problem)
+        self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
+        self.add_param_class("spinds", problem, "ConvProblem")        
+        self.add_param_class("cudakers", CudaCommonKernel())        
+
+        self.ndim = problem.ndim 
+        self.dtype_indices = dtype_indices
+        # currently always the same type as dtype_indices
+        self.dtype_indices_uniq = dtype_indices
+
+        assert dtype_indices == dtypes.int32 or dtype_indices == dtypes.int64
+
+
+    @pccm.cuda.cuda_global_function
+    def calc_conv_indices_stage1(self):
+        # Emits the first-stage kernel of regular conv index generation.
+        # blockIdx.y selects the kernel (filter) offset. For every input row
+        # i whose output location is valid for that offset
+        # (loc_iter.query_npq), the kernel takes a slot from the atomic
+        # counter indice_num_per_loc[filter_offset] and, if the slot is
+        # below indices_pair_size, stores:
+        #   * i in the first half of indice_pairs,
+        #   * the linearized output coordinate (loc_iter.layout_npq) in the
+        #     second half of indice_pairs and in indice_pairs_for_uniq.
+        # Slots at or beyond indices_pair_size are counted but not written.
+        code = pccm.FunctionCode()
+        code.arg("loc_iter", f"ConvLocIter") # coordinate helper, passed by value
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") # indexed as [kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*") # [kernelProd]
+
+        code.arg("num_indices_in", "int")
+        code.arg("indices_pair_size", "int")
+
+        code.arg("RS", "int")
+        # code.arg("bool", "transposed")
+
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        loc_iter.set_filter_offset(filter_offset);
+        int indices_pair_size_mul_RS = indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+        for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+            tv::array<int, {self.ndim + 1}> npq_offset;
+            if (loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset)){{
+                int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
+                {self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
+                if (old_num < indices_pair_size){{
+                    indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
+                    indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = offset;
+                    indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + old_num] = offset;
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def build_conv_hash_table(self):
+        # Emits a kernel that, for each i in [0, num_indices):
+        #   * reads the linear output coordinate indice_pairs_for_uniq[i],
+        #   * decodes it with layout_npq.inverse into row i of indices_out,
+        #   * inserts (linear coordinate -> i) into the hash table.
+        # The caller is expected to pass already de-duplicated coordinates
+        # (not checked here).
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+
+        code.arg("table", f"TTable") # hash table: linear output coord -> output row
+        code.arg("indices_out", f"int*") # [N, ndim + 1]
+        code.arg("indice_pairs_for_uniq", f"const {self.dtype_indices}*") # [num_indices] linear output coords
+
+        code.arg("layout_npq", f"spinds::LayoutNPQ") # layout used to decode the linear coords
+
+        code.arg("num_indices", "int")
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(num_indices)) {{
+            {self.dtype_indices} index = indice_pairs_for_uniq[i];
+            layout_npq.inverse(index, indices_out + {self.ndim + 1} * i);
+            table.insert(index, i);
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_conv_indices_stage2(self):
+        # Emits the second-stage kernel: in the row of indice_pairs_out_part
+        # selected by blockIdx.y, every entry greater than -1 is looked up
+        # in the hash table and, when found, replaced in place by the value
+        # stored for it (ptr->second). Entries that are not found are left
+        # unchanged.
+        # NOTE(review): indice_pairs_out_part is declared int* while each
+        # value is read into a dtype_indices local; these differ when
+        # dtype_indices is int64 -- confirm this is intended.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", f"TTable") # hash table: linear output coord -> output row
+        code.arg("indice_pairs_out_part", f"int*") # output half of indice_pairs, indexed as [kernelProd, MaxSize]
+        code.arg("num_indices_in", "int")
+        code.arg("indices_pair_size", "int")
+        # TODO use block instead of filter_offset?
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
+        for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+            {self.dtype_indices} index = indice_pairs_out_part_filter[i];
+            if (index > -1){{
+                auto ptr = table.lookup_ptr(index);
+                if (ptr){{
+                    indice_pairs_out_part_filter[i] = ptr->second;
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def build_subm_conv_hash_table(self):
+        # Emits a kernel that inserts, for each row i of indices_in, the
+        # pair (layout_npq(coordinate of row i) -> i) into the hash table.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+
+        code.arg("table", f"TTable") # hash table: linear coord -> row index
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+
+        code.arg("layout_npq", f"spinds::LayoutNPQ") 
+
+        code.arg("num_indices", "int")
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(num_indices)) {{
+            {self.dtype_indices} index = layout_npq(indices_in + i * {self.ndim + 1});
+            table.insert(index, i);
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def clean_indices_uniq(self):
+        # Emits a kernel that sets every element of indice_pairs_for_uniq in
+        # [0, size) to the maximum value of the index dtype
+        # (std::numeric_limits<...>::max()).
+        code = pccm.FunctionCode()
+        code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*") 
+        code.arg("size", f"{self.dtype_indices}") 
+        code.raw(f"""
+        for ({self.dtype_indices} i : tv::KernelLoopX<{self.dtype_indices}>(size)) {{
+            indice_pairs_for_uniq[i] = std::numeric_limits<{self.dtype_indices}>::max();
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_subm_conv_indices(self):
+        # Emits the submanifold-conv pair kernel. blockIdx.y selects the
+        # kernel offset.
+        #   * Center offset (RS / 2): every row i is paired with itself in
+        #     both halves of indice_pairs; indice_num_per_loc is not touched.
+        #   * Other offsets: for each row i with a valid neighbour location
+        #     (query_npq_no_stride) that exists in the hash table, a slot is
+        #     taken from indice_num_per_loc[filter_offset] and the pair
+        #     (i, found row) is written for this offset, plus the swapped
+        #     pair (found row, i) at the same slot of the mirrored offset
+        #     RS - 1 - filter_offset.
+        # Only the counter of filter_offset is incremented, never the
+        # mirrored one, and the slot is not checked against
+        # indices_pair_size. Presumably launched only for offsets
+        # 0..RS / 2 -- confirm at the launch site.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("loc_iter", f"ConvLocIter") # coordinate helper, passed by value
+        code.arg("table", f"TTable") # hash table: linear coord -> row index
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*") # [kernelProd]
+
+        code.arg("num_indices_in", "int")
+        code.arg("indices_pair_size", "int")
+
+        code.arg("RS", "int")
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        loc_iter.set_filter_offset(filter_offset);
+        int indices_pair_size_mul_RS = indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+
+        int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
+        if (filter_offset == (RS / 2)){{
+            for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+                indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
+                indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
+            }}
+        }} else {{
+            for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+                tv::array<int, {self.ndim + 1}> npq_offset;
+                if (loc_iter.query_npq_no_stride(indices_in + i * {self.ndim + 1}, npq_offset)){{
+                    {self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
+                    auto item = table.lookup(offset); // performance bound
+                    if (!item.empty()){{
+                        int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
+                        indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = item.second;
+                        indice_pairs[filter_offset_mul_indices_pair_size_1 + old_num] = item.second;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + old_num] = i;
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_subm_conv_indices_mask(self):
+        # Emits the masked variant of the submanifold-conv pair kernel.
+        # Pairs are stored at the position of the point itself (no atomic
+        # slot counter), and a per-point bit mask records which kernel
+        # offsets have a partner.
+        #   * Center offset (RS / 2): every point is paired with itself in
+        #     both halves of indice_pairs; the mask is not touched.
+        #   * Other offsets: for each output point whose input location
+        #     (query_nhw) exists in the hash table, bit filter_offset is
+        #     OR-ed into mask[output_index] and bit RS - 1 - filter_offset
+        #     into mask[input_index]; the pair and its swapped counterpart
+        #     are written for this offset and for the mirrored offset.
+        # Entries with no partner are not written, so they keep whatever the
+        # caller stored beforehand. filter_mask_center is computed but
+        # unused (its atomicOr is commented out).
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("loc_iter", f"ConvLocIter") # coordinate helper, passed by value
+        code.arg("table", f"TTable") # hash table: linear coord -> row index
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("mask", f"uint32_t*") # one 32-bit mask per point, indexed by point index
+
+        code.arg("num_indices", "int")
+        code.arg("indices_pair_size", "int")
+
+        code.arg("RS", "int")
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        uint32_t filter_mask_out = (1u << (filter_offset));
+        uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
+        uint32_t filter_mask_center = (1u << (RS / 2));
+
+        loc_iter.set_filter_offset(filter_offset);
+        int indices_pair_size_mul_RS = indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+
+        int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
+        if (filter_offset == (RS / 2)){{
+            for (int i : tv::KernelLoopX<int>(num_indices)) {{
+                // atomicOr(mask + i, filter_mask_center);
+                indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
+                indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
+            }}
+        }} else {{
+            for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
+                // find input offset from output offset
+                tv::array<int, {self.ndim + 1}> nhw_offset;
+                // table: input indice coord to output index (or output indice coord to input index)
+                if (loc_iter.query_nhw(indices_in + output_index * {self.ndim + 1}, nhw_offset)){{
+                    {self.dtype_indices} offset = loc_iter.layout_npq(nhw_offset);
+                    auto item = table.lookup(offset);
+                    if (!item.empty()) {{
+                        auto input_index = item.second; // we find a input indice idx.
+                        atomicOr(mask + output_index, filter_mask_out);
+                        atomicOr(mask + input_index, filter_mask_in);
+                        // for this output, we set correct input idx.
+                        indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
+                        // the output in "input location" connect this output idx in another location.
+                        indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + input_index] = output_index;
+                        indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def calc_subm_conv_indices_split_mask(self):
+        # Emits the split-mask variant of the submanifold-conv pair kernel.
+        # The pair writes follow the same pattern as
+        # calc_subm_conv_indices_mask, but the bits go to two separate mask
+        # buffers: bit filter_offset is OR-ed into mask1[output_index] and
+        # bit RS - 1 - filter_offset into mask2[input_index].
+        # indice_ptr_inv points at the second half of indice_pairs.
+        # filter_mask_center is computed but unused.
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("loc_iter", f"ConvLocIter") # coordinate helper, passed by value
+        code.arg("table", f"TTable") # hash table: linear coord -> row index
+
+        code.arg("indices_in", f"const int*") # [N, ndim + 1]
+        code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
+        code.arg("mask1", f"uint32_t*") # per-point mask, written at output_index
+        code.arg("mask2", f"uint32_t*") # per-point mask, written at input_index
+
+        code.arg("num_indices", "int")
+        code.arg("indices_pair_size", "int")
+
+        code.arg("RS", "int")
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        uint32_t filter_mask_out = (1u << (filter_offset));
+        uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
+        uint32_t filter_mask_center = (1u << (RS / 2));
+
+        loc_iter.set_filter_offset(filter_offset);
+        auto indice_ptr_inv = indice_pairs + indices_pair_size * RS;
+        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
+        int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
+        if (filter_offset == (RS / 2)){{
+            for (int i : tv::KernelLoopX<int>(num_indices)) {{
+                indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
+                indice_ptr_inv[filter_offset_mul_indices_pair_size + i] = i;
+            }}
+        }} else {{
+            for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
+                // find input offset from output offset
+                tv::array<int, {self.ndim + 1}> nhw_offset;
+                // table: input indice coord to output index (or output indice coord to input index)
+                if (loc_iter.query_nhw(indices_in + output_index * {self.ndim + 1}, nhw_offset)){{
+                    {self.dtype_indices} offset = loc_iter.layout_npq(nhw_offset);
+                    auto item = table.lookup(offset);
+                    if (!item.empty()) {{
+                        auto input_index = item.second; // we find a input indice idx.
+                        atomicOr(mask1 + output_index, filter_mask_out);
+                        atomicOr(mask2 + input_index, filter_mask_in);
+                        // for this output, we set correct input idx.
+                        indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
+                        // the output in "input location" connect this output idx in another location.
+                        indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
+                        indice_ptr_inv[filter_offset_mul_indices_pair_size + input_index] = output_index;
+                        indice_ptr_inv[filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.static_function
+    def generate_conv_inds(self):
+        # Emits the single-call host function for regular conv index
+        # generation. It returns the number of active output locations
+        # (num_out_act) as int. Steps of the generated C++:
+        #   1. validate buffer shapes against kv = prod(ksize),
+        #   2. fill indice_pairs_uniq with the dtype maximum
+        #      (clean_indices_uniq),
+        #   3. run calc_conv_indices_stage1 with one blockIdx.y per kernel
+        #      offset,
+        #   4. thrust::sort + thrust::unique on indice_pairs_uniq; one is
+        #      subtracted from the unique count, which accounts for the
+        #      dtype-maximum filler value when one remains,
+        #   5. clear the hash table built on hashdata and fill it with
+        #      build_conv_hash_table (also writes out_inds),
+        #   6. run calc_conv_indices_stage2 on indice_pairs[1].
+        # Everything runs on stream 0 and timing is printed with
+        # tv::ssprint after each step.
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.raw(f"""
+        // TODO stream
+        // TODO handle num input == 0
+        int kv = tv::arrayops::prod(ksize);
+        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
+        // indice_pairs: [2, kv, indices.dim(0)]
+        // indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
+        // out_inds: [MaxSize, {self.ndim + 1}]
+        auto timer = tv::CudaContextTimer<>();
+        int64_t uniq_size = indice_pairs.size() / 2 + 1;
+        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
+        TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
+        
+        int64_t expected_out_size = indices.dim(0) * kv;
+        TV_ASSERT_RT_ERR(out_inds.dim(0) == expected_out_size && out_inds.dim(1) == {self.ndim + 1}, "error");
+        tv::cuda::Launch launcher_num_act_in(indices.dim(0));
+        // tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
+        launcher_num_act_in.blocks.y = kv;
+
+        ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
+        ConvLocIter loc_iter(problem);
+        tv::cuda::Launch launcher_clean_uniq(uniq_size);
+        launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), uniq_size);
+        tv::ssprint("clean time", timer.report() / 1000.0);
+
+        launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(), 
+            indice_pairs.data_ptr<{self.dtype_indices}>(), 
+            indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
+            indice_pairs.dim(2), kv);
+        tv::ssprint("calc_conv_indices_stage1 time", timer.report() / 1000.0, uniq_size);
+
+        thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
+        auto thrust_ctx = thrust::cuda::par.on(0);
+        thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        auto num_out_act = new_end - ptr_tr - 1;
+        tv::ssprint("unique time", num_out_act, timer.report() / 1000.0);
+
+        // return num_out_act;
+        // TODO handle invalid num_out_act
+        indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
+        tv::cuda::Launch lanucher_build_hash(num_out_act);
+        using V = {self.dtype_indices};
+        using KeyType = {self.dtype_indices};
+        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
+
+        using table_t =
+            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
+                                        kEmptyKey, false>;
+        using pair_t = typename table_t::value_type;
+        TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
+        table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
+        hash.clear();
+        tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
+
+        lanucher_build_hash(build_conv_hash_table<table_t>, hash, out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(), 
+            loc_iter.layout_npq, num_out_act);
+        tv::ssprint("build_hash time", num_out_act, timer.report() / 1000.0);
+
+        launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, indice_pairs[1].data_ptr<int>(), indices.dim(0), 
+            indice_pairs.dim(2));
+        tv::ssprint("gem conv inds time", timer.report() / 1000.0);
+        return num_out_act;
+        """)
+
+        return code.ret("int")
+
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage1(self):
+        # Emits the first half of conv index generation as a separate host
+        # function that runs on the CUDA stream given by stream_int ("0" is
+        # passed as the third code.arg argument, presumably the default).
+        # The generated C++ fills indice_pairs_uniq with the dtype maximum,
+        # runs calc_conv_indices_stage1, then sorts and de-duplicates
+        # indice_pairs_uniq with thrust. It returns the unique count minus
+        # one as int (num_out_act).
+        # NOTE(review): expected_out_size is computed but never used in
+        # this function.
+        code = pccm.FunctionCode()
+        code.arg("indices", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("stream_int", f"std::uintptr_t", "0")
+
+        code.raw(f"""
+        // TODO stream
+        // TODO handle num input == 0
+        int kv = tv::arrayops::prod(ksize);
+        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
+        // indice_pairs: [2, kv, indices.dim(0)]
+        // indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
+        int64_t uniq_size = indice_pairs.size() / 2 + 1;
+        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= uniq_size, "error");
+        TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
+        int64_t expected_out_size = indices.dim(0) * kv;
+        tv::cuda::Launch launcher_num_act_in(indices.dim(0), reinterpret_cast<cudaStream_t>(stream_int));
+        // tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
+        launcher_num_act_in.blocks.y = kv;
+        ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
+        ConvLocIter loc_iter(problem);
+        tv::cuda::Launch launcher_clean_uniq(uniq_size, reinterpret_cast<cudaStream_t>(stream_int));
+        launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), uniq_size);
+        launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(), 
+            indice_pairs.data_ptr<{self.dtype_indices}>(), 
+            indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
+            indice_pairs.dim(2), kv);
+        thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
+        auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
+        thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        auto num_out_act = new_end - ptr_tr - 1;
+        return num_out_act;
+        """)
+        return code.ret("int")
+
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage2(self):
+        # Emits the second half of conv index generation as a separate host
+        # function. It takes num_out_act (the count produced by the first
+        # half) and runs on the CUDA stream given by stream_int.
+        # The generated C++ slices indice_pairs_uniq to its first
+        # num_out_act entries, clears and fills the hash table on hashdata
+        # with build_conv_hash_table (which also writes out_inds), then
+        # runs calc_conv_indices_stage2 on indice_pairs[1]. It returns
+        # num_out_act unchanged.
+        # The shape check on indice_pairs_uniq uses == here, whereas
+        # generate_conv_inds_stage1 uses >=.
+        code = pccm.FunctionCode()
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
+        code.arg("num_out_act", "int")
+        code.arg("batch_size", "int")
+        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("stream_int", f"std::uintptr_t", "0")
+        code.raw(f"""
+        auto custream = reinterpret_cast<cudaStream_t>(stream_int);
+        // TODO stream
+        // TODO handle num input == 0
+        int kv = tv::arrayops::prod(ksize);
+        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
+        // indice_pairs: [2, kv, indices.dim(0)]
+        // indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
+        // out_inds: [MaxSize, {self.ndim + 1}]
+        auto timer = tv::CudaContextTimer<>();
+        int64_t uniq_size = indice_pairs.size() / 2 + 1;
+        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
+        TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
+        tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
+        launcher_num_act_in.blocks.y = kv;
+        ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
+        ConvLocIter loc_iter(problem);
+        
+        // TODO handle invalid num_out_act
+        indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
+        tv::cuda::Launch lanucher_build_hash(num_out_act, custream);
+        using V = {self.dtype_indices};
+        using KeyType = {self.dtype_indices};
+        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
+        using table_t =
+            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
+                                        kEmptyKey, false>;
+        using pair_t = typename table_t::value_type;
+        TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
+        table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
+        hash.clear(custream);
+        lanucher_build_hash(build_conv_hash_table<table_t>, hash, 
+            out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(), 
+            loc_iter.layout_npq, num_out_act);
+        launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, 
+            indice_pairs[1].data_ptr<int>(), indices.dim(0), 
+            indice_pairs.dim(2));
+        return num_out_act;
+        """)
+        return code.ret("int")
+
+
+    @pccm.cuda.static_function
+    def generate_subm_conv_inds(self):
+        """Build the host function that generates submanifold-conv index pairs.
+
+        The returned ``pccm.FunctionCode`` declares these C++ parameters:
+
+        * ``indices``, ``hashdata``, ``indice_pairs``, ``out_inds``,
+          ``indice_num_per_loc``: ``tv::Tensor``.
+        * ``batch_size``: ``int``.
+        * ``input_dims``, ``ksize``, ``dilation``: ``tv::array<int, ndim>``.
+        * ``indice_pair_mask``: ``tv::Tensor``, defaults to an empty tensor.
+        * ``backward``: ``bool``, defaults to ``false``.
+        * ``stream_int``: ``std::uintptr_t``, defaults to ``0``; it is
+          reinterpreted as a ``cudaStream_t``.
+
+        What the generated body does:
+
+        1. If a mask is given, requires ``prod(ksize) < 32``.
+        2. Requires every ``ksize[i]`` to be odd, fixes ``stride`` to 1 and
+           sets ``padding[i] = (ksize[i] / 2) * dilation[i]``.
+        3. Checks ``indice_pairs.dim(1)`` and ``indice_num_per_loc.dim(0)``
+           against ``kv = prod(ksize)`` and ``hashdata.dim(0)`` against
+           ``indices.dim(0)``.
+        4. Clears a linear hash table stored in ``hashdata`` and fills it from
+           ``indices`` with ``build_subm_conv_hash_table``.
+        5. Launches exactly one pair-generation kernel with ``kv / 2 + 1``
+           blocks in y:
+           ``calc_subm_conv_indices_split_mask`` when the mask is 2-D with
+           ``dim(0) == 2`` (``mask[0]`` pre-filled with ``1 << (kv / 2)``,
+           ``mask[1]`` zeroed), ``calc_subm_conv_indices_mask`` for any other
+           non-empty mask (asserted 1-D, pre-filled with ``1 << (kv / 2)``),
+           or ``calc_subm_conv_indices`` when no mask is given.
+
+        NOTE(review): ``out_inds`` and ``backward`` are declared but never
+        referenced by the generated body; confirm whether that is intended.
+
+        Returns:
+            The ``FunctionCode`` with C++ return type ``int``; the generated
+            function returns ``indices.dim(0)``.
+        """
+        code = pccm.FunctionCode()
+        # Several names in one string declare several parameters of one type.
+        code.arg("indices, hashdata", "tv::Tensor")
+        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
+        code.arg("batch_size", "int")
+        code.arg("input_dims", f"tv::array<int, {self.ndim}>")
+        code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
+        code.arg("backward", "bool", "false")
+        code.arg("stream_int", f"std::uintptr_t", "0")
+
+        # Everything below is C++ emitted verbatim; doubled braces are literal
+        # braces because the text is an f-string.
+        code.raw(f"""
+        auto custream = reinterpret_cast<cudaStream_t>(stream_int);
+        auto ctx = tv::Context();
+        ctx.set_cuda_stream(custream);
+        if (!indice_pair_mask.empty()){{
+            TV_ASSERT_INVALID_ARG(tv::arrayops::prod(ksize) < 32, "for now only support 32bit mask");
+        }}
+        // TODO stream
+        // TODO handle num input == 0
+        tv::array<int, {self.ndim}> stride, padding;
+        for (int i = 0; i < {self.ndim}; ++i){{
+            TV_ASSERT_RT_ERR(ksize[i] % 2 == 1, "subm only support odd ksize");
+            stride[i] = 1;
+            padding[i] = (ksize[i] / 2) * dilation[i];
+        }}
+        int kv = tv::arrayops::prod(ksize);
+        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
+        // indice_pairs: [2, kv, indices.dim(0)]
+        // out_inds: [MaxSize, {self.ndim + 1}]
+        // auto timer = tv::CudaContextTimer<>();
+        TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
+        tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
+        launcher_num_act_in.blocks.y = (kv / 2) + 1;
+        // launcher_num_act_in.blocks.y = kv;
+
+        ConvProblem problem(batch_size, 1, 1, input_dims, input_dims, ksize, padding, stride, dilation);
+        ConvLocIter loc_iter(problem);
+
+        tv::cuda::Launch lanucher_build_hash(indices.dim(0), custream);
+        using V = {self.dtype_indices};
+        using KeyType = {self.dtype_indices};
+        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
+
+        using table_t =
+            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
+                                        kEmptyKey, false>;
+        using pair_t = typename table_t::value_type;
+        TV_ASSERT_RT_ERR(hashdata.dim(0) >= indices.dim(0), "hash size not enough");
+        table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
+        hash.clear(custream);
+        // tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
+
+        lanucher_build_hash(build_subm_conv_hash_table<table_t>, hash, indices.data_ptr<const int>(),
+            loc_iter.layout_npq, indices.dim(0));
+        // tv::ssprint("build_hash time", timer.report() / 1000.0);
+        if (!indice_pair_mask.empty()){{
+            if (indice_pair_mask.ndim() == 2 && indice_pair_mask.dim(0) == 2){{
+                auto mask_0 = indice_pair_mask[0];
+                tv::cuda::Launch lanucher_fill(mask_0.size(), custream);
+                lanucher_fill(cudakers::fill_kernel<int>, mask_0.data_ptr<int>(), (1 << (kv / 2)), mask_0.size());
+                indice_pair_mask[1].zero_(ctx);
+                auto kernel = &calc_subm_conv_indices_split_mask<table_t>;
+                launcher_num_act_in(kernel, loc_iter, hash,  
+                    indices.data_ptr<int>(), indice_pairs.data_ptr<int>(), 
+                    indice_pair_mask[0].data_ptr<uint32_t>(), indice_pair_mask[1].data_ptr<uint32_t>(), 
+                    indices.dim(0), indice_pairs.dim(2), kv);
+            }}else{{
+                tv::cuda::Launch lanucher_fill(indice_pair_mask.size(), custream);
+                lanucher_fill(cudakers::fill_kernel<int>, indice_pair_mask.data_ptr<int>(), (1 << (kv / 2)), indice_pair_mask.size());
+                TV_ASSERT_RT_ERR(indice_pair_mask.ndim() == 1, "error");
+                launcher_num_act_in(calc_subm_conv_indices_mask<table_t>, loc_iter, hash, 
+                    indices.data_ptr<int>(), indice_pairs.data_ptr<int>(), 
+                    indice_pair_mask.data_ptr<uint32_t>(), indices.dim(0), indice_pairs.dim(2), kv);
+            }}
+        }}else{{
+            launcher_num_act_in(calc_subm_conv_indices<table_t>, loc_iter, hash, indices.data_ptr<int>(), 
+                indice_pairs.data_ptr<int>(), 
+                indice_num_per_loc.data_ptr<int>(), indices.dim(0), indice_pairs.dim(2), kv);
+        }}
+        // tv::ssprint("gem subm conv inds time", timer.report() / 1000.0);
+        return indices.dim(0);
+        """)
+
+        return code.ret("int")
--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from cumm.conv.bases import ConvEnum
+from cumm.gemm.core.metaarray import MetaArray, seq
+from cumm import dtypes
+import pccm 
+from cumm.gemm.layout import TensorGeneric, to_stride
+from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
+from cumm.gemm import codeops
+from typing import List 
+from cumm.conv.params import ConvProblem
+import numpy as np 
+
+class IndiceMaxPool(pccm.Class):
+    """Code generator for max pooling driven by (input, output) index pairs.
+
+    Emits two templated CUDA kernels (``forward_kernel``, ``backward_kernel``)
+    and two static host launchers (``forward``, ``backward``) that dispatch on
+    the feature dtype and pick a 2-D thread-block shape from the number of
+    feature channels.
+    """
+    # TODO optimize this function
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorViewKernel, TensorView, GemmBasic)
+    
+    @pccm.cuda.cuda_global_function
+    def forward_kernel(self):
+        """Build the forward kernel, templated on the feature type ``T``.
+
+        For every pair ``i`` in ``[0, size)`` (iterated with
+        ``tv::KernelLoopY``) the kernel reads ``in_indices[i]`` and
+        ``out_indices[i]``, treats them as row numbers into ``in_features`` and
+        ``out_features`` (row length ``num_features``), and for every channel
+        (iterated with ``tv::KernelLoopX``) overwrites the output value when
+        the input value is strictly greater.
+
+        NOTE(review): the update is a plain read/compare/write, not an atomic;
+        presumably each output row occurs at most once per launch -- verify
+        against the caller.
+        """
+        code = pccm.FunctionCode()
+        code.targ("T")
+
+        code.arg("out_features", f"T*") 
+        code.arg("in_features", f"const T*")
+        code.arg("out_indices", "const int*")
+        code.arg("in_indices", "const int*")
+        code.arg("size", "int")
+        code.arg("num_features", "int")
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(size)) {{
+            int in_idx = in_indices[i];
+            int out_idx = out_indices[i];
+            auto in_ptr = in_features + in_idx * num_features;
+            auto out_ptr = out_features + out_idx * num_features;
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                auto in = in_ptr[j];
+                auto out = out_ptr[j];
+                if (in > out){{
+                    out_ptr[j] = in;
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def backward_kernel(self):
+        """Build the backward kernel, templated on the feature type ``T``.
+
+        For every pair ``i`` and every channel ``j`` the kernel compares the
+        forward input value with the forward output value; where they are
+        equal it adds ``dout`` (at the output row) to ``din`` (at the input
+        row). Rows are addressed as ``index * num_features``.
+
+        NOTE(review): the accumulation into ``din`` is not atomic; confirm the
+        caller guarantees that no two pairs of one launch share an input row.
+        """
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("out_features", f"const T*") 
+        code.arg("in_features", f"const T*")
+        code.arg("dout_features", f"const T*") 
+        code.arg("din_features", f"T*")
+        code.arg("out_indices", "const int*")
+        code.arg("in_indices", "const int*")
+        code.arg("size", "int")
+        code.arg("num_features", "int")
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(size)) {{
+            int in_idx_offset = in_indices[i] * num_features;
+            int out_idx_offset = out_indices[i] * num_features;
+            auto in_ptr = in_features + in_idx_offset;
+            auto out_ptr = out_features + out_idx_offset;
+            auto din_ptr = din_features + in_idx_offset;
+            auto dout_ptr = dout_features + out_idx_offset;
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                auto in = in_ptr[j];
+                auto out = out_ptr[j];
+                if (in == out){{
+                    din_ptr[j] = din_ptr[j] + dout_ptr[j];
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.static_function
+    def forward(self):
+        """Build the host launcher for ``forward_kernel``.
+
+        C++ parameters: ``out``, ``in``, ``out_inds``, ``in_inds``
+        (``tv::Tensor``) and ``stream`` (``std::uintptr_t``, default ``0``,
+        reinterpreted as a ``cudaStream_t``).
+
+        The generated body takes the pair count from ``out_inds.dim(0)`` and
+        the channel count from ``out.dim(1)``, dispatches on ``out.dtype()``
+        over float, double, half and bfloat16, and chooses the block shape:
+        ``threads.x`` is the candidate from {512, 256, 128, 64, 32, 16}
+        selected by ``tv::dispatch_int_noexcept`` with the predicate
+        ``out.dim(1) >= candidate`` (16 when none matches) and
+        ``threads.y = 512 / threads.x``. No return type is declared.
+        """
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("in", "tv::Tensor")
+        code.arg("out_inds", "tv::Tensor")
+        code.arg("in_inds", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+
+        code.raw(f"""
+        auto nhot = out_inds.dim(0);
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            constexpr int MaxThreads = 512;
+            tv::cuda::Launch launcher(1);
+            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
+                // if out.dim(1) > value in list above, run this function.
+                // if a value is found, other value won't be executed.
+                constexpr int NumFeatures = TV_DECLTYPE(V)::value;
+                constexpr int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }});
+            if (!found){{
+                constexpr int NumFeatures = 16;
+                constexpr int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }}
+            launcher(forward_kernel<T>, out.data_ptr<T>(), in.data_ptr<const T>(),
+                out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
+
+        }});
+        """)
+        return code
+
+    @pccm.cuda.static_function
+    def backward(self):
+        """Build the host launcher for ``backward_kernel``.
+
+        C++ parameters: ``out``, ``in``, ``dout``, ``din``, ``out_inds``,
+        ``in_inds`` (``tv::Tensor``) and ``stream`` (``std::uintptr_t``,
+        default ``0``). Dtype dispatch and block-shape selection are the same
+        as in the launcher built by ``forward``; ``din`` is the only tensor
+        passed to the kernel through a non-const pointer. No return type is
+        declared.
+        """
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("in", "tv::Tensor")
+        code.arg("dout", "tv::Tensor")
+        code.arg("din", "tv::Tensor")
+        code.arg("out_inds", "tv::Tensor")
+        code.arg("in_inds", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+
+        code.raw(f"""
+        auto nhot = out_inds.dim(0);
+
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            constexpr int MaxThreads = 512;
+            tv::cuda::Launch launcher(1);
+            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
+                // if out.dim(1) > value in list above, run this function.
+                // if a value is found, other value won't be executed.
+                constexpr int NumFeatures = TV_DECLTYPE(V)::value;
+                constexpr int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }});
+            if (!found){{
+                constexpr int NumFeatures = 16;
+                constexpr int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }}
+            launcher(backward_kernel<T>, out.data_ptr<const T>(), in.data_ptr<const T>(),
+                dout.data_ptr<const T>(), din.data_ptr<T>(),
+                out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
+        }});
+        """)
+        return code
+
--- a/spconv/csrc/sparse/pointops.py
+++ b/spconv/csrc/sparse/pointops.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from cumm.gemm.core.metaarray import MetaArray, seq
+from cumm import dtypes
+import pccm 
+from cumm.gemm.layout import TensorGeneric, to_stride
+from cumm.common import TensorView, TensorViewHashKernel
+from cumm.gemm import codeops
+from typing import List 
+from cumm.conv.params import ConvProblem
+import numpy as np 
+
+
+class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
+    """CUDA kernels that voxelize a point cloud through a GPU hash table.
+
+    This class does not support multi-threading: create one point-to-voxel
+    object per thread.
+
+    Args:
+        dtype: element type of the point and voxel buffers.
+        ndim: number of spatial dimensions.
+        layout: layout parameter class, exposed to the generated code as
+            ``Layout``; used to decode a linear voxel key into coordinates.
+        zyx: when true, grid axis ``j`` reads point column ``ndim - 1 - j``
+            (reversed axis order); otherwise it reads column ``j``.
+    """
+    def __init__(self, dtype: dtypes.DType, ndim: int, layout: TensorGeneric, zyx: bool = True):
+        super().__init__()
+        self.add_dependency(TensorView, TensorViewHashKernel)
+        self.add_param_class("layout_ns", layout, "Layout")
+        self.dtype = dtype
+        self.ndim = ndim
+        self.zyx = zyx
+
+    @pccm.cuda.cuda_global_function
+    def build_hash_table(self):
+        """Build the kernel that maps every point to a linear voxel key.
+
+        For each point the generated kernel computes, per grid axis,
+        ``c = floor((coordinate - coors_range[j]) / vsize[j])`` and accumulates
+        ``prod += grid_stride[j] * c``. A point with any ``c`` outside
+        ``[0, grid_bound[j])`` is rejected and gets ``-1`` in
+        ``points_indice_data``; an accepted point stores ``prod`` there and
+        inserts ``(prod, i)`` into the hash table.
+
+        The key and ``points_indice_data`` are 64-bit, so the stride/cell
+        product is widened to ``int64_t`` before multiplying; a 32-bit product
+        would overflow for large grids.
+        """
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", "TTable")
+        code.arg("points", f"{self.dtype} const*")
+        code.arg("points_indice_data", f"int64_t *")
+
+        code.arg("point_stride", f"int")
+        code.arg("vsize", f"tv::array<float, {self.ndim}>")
+        code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        code.arg("grid_bound", f"tv::array<int, {self.ndim}>")
+        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")
+
+        code.arg("num_points", f"int")
+        # Point column read for grid axis j: reversed when the grid is zyx.
+        point_xyz = f"{self.ndim - 1} - j"
+        if not self.zyx:
+            point_xyz = f"j"
+        # if zyx, the coors_range and grid_bound are zyx too,
+        # and the generated indices are zyx.
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(num_points)){{
+            bool failed = false;
+            int c;
+            int64_t prod = 0;
+        #pragma unroll
+            for (int j = 0; j < {self.ndim}; ++j) {{
+                c = floor((points[i * point_stride + {point_xyz}] - coors_range[j]) /
+                            vsize[j]);
+                if ((c < 0 || c >= grid_bound[j])) {{
+                    failed = true;
+                }}
+                // widen before multiplying: int * int would overflow on large grids.
+                prod += int64_t(grid_stride[j]) * int64_t(c);
+            }}
+            if (!failed){{
+                points_indice_data[i] = prod;
+                table.insert(prod, i);
+            }}else{{
+                points_indice_data[i] = -1;
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def assign_table(self):
+        """Build the kernel that numbers the occupied hash slots.
+
+        Every non-empty slot gets ``item.second = atomicAggInc(count)``. When
+        that number is below ``max_voxels`` the slot key is decoded with
+        ``layout.inverse`` into row ``item.second`` of ``indices`` (``ndim``
+        ints per row).
+
+        ``count`` is incremented for every occupied slot, including those that
+        end up at or beyond ``max_voxels``, so its final value can exceed
+        ``max_voxels``.
+        """
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", "TTable")
+        code.arg("indices", f"int*")
+        code.arg("count", f"int*")
+        code.arg("layout", f"Layout")
+        code.arg("max_voxels", f"int")
+
+        code.raw(f"""
+        auto data = table.data();
+        for (int i : tv::KernelLoopX<int>(table.size())){{
+            auto &item = data[i];
+            if (!item.empty()) {{
+                item.second = tv::cuda::atomicAggInc(count);
+                if (item.second < max_voxels){{
+                    layout.inverse(item.first, indices + item.second * {self.ndim});
+                }}
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def generate_voxel(self):
+        """Build the kernel that copies points into their voxel slots.
+
+        For each point whose stored key is not ``-1`` the kernel looks the key
+        up in the table. If the entry exists and its voxel number is below
+        ``max_voxels``, it reserves a slot with ``atomicAdd`` on
+        ``num_per_voxel`` and, when the reserved slot is below
+        ``max_points_per_voxel``, copies ``point_stride`` values of the point
+        into ``voxels``.
+
+        The counter is incremented before the capacity check, so an entry of
+        ``num_per_voxel`` can end up larger than ``max_points_per_voxel``.
+
+        ``vsize``, ``coors_range``, ``grid_bound`` and ``grid_stride`` are
+        declared but not read by the generated body.
+        """
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+        code.arg("table", "TTable")
+        code.arg("points", f"{self.dtype} const*")
+
+        code.arg("points_indice_data", f"const int64_t*")
+        code.arg("voxels", f"{self.dtype} *")
+        code.arg("num_per_voxel", f"int *")
+
+        code.arg("point_stride", f"int")
+        code.arg("max_points_per_voxel", f"int")
+        code.arg("max_voxels", f"int")
+
+        code.arg("vsize", f"tv::array<float, {self.ndim}>")
+        code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        code.arg("grid_bound", f"tv::array<int, {self.ndim}>")
+        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")
+
+        code.arg("num_points", f"int")
+        code.raw(f"""
+        int voxel_stride0 = point_stride * max_points_per_voxel;
+        for (int i : tv::KernelLoopX<int>(num_points)){{
+            int64_t prod = points_indice_data[i];
+            if (prod != -1){{
+                auto voxel_index_pair = table.lookup(prod);
+                if (!voxel_index_pair.empty() &&
+                    voxel_index_pair.second < max_voxels) {{
+                    int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1);
+                    if (old < max_points_per_voxel) {{
+                        for (int j = 0; j < point_stride; ++j) {{
+                            voxels[voxel_index_pair.second * voxel_stride0 + old * point_stride + j] = points[i * point_stride + j];
+                        }}
+                    }}
+                }}
+            }}
+        }}
+        """)
+        return code
+
+class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
+    """Code generator for the GPU point-to-voxel converter bound to Python.
+
+    The generated C++ class owns the output buffers (``voxels``, ``indices``,
+    ``num_per_voxel``) and two scratch buffers (``hashdata``,
+    ``point_indice_data``), all exposed to Python as read-only members, plus
+    the grid description (``vsize``, ``coors_range``, ``grid_size``,
+    ``grid_stride``).
+
+    Args:
+        dtype: element type of the points and of the ``voxels`` buffer.
+        ndim: number of spatial dimensions.
+        zyx: when true, grid quantities are stored in reversed (zyx) order
+            relative to the xyz constructor arguments.
+    """
+    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
+        super().__init__()
+        self.add_dependency(TensorView)
+        layout = TensorGeneric(ndim, True)
+        self.add_param_class("layout_ns", layout, "Layout")
+        self.dtype = dtype
+        self.ndim = ndim
+        self.zyx = zyx
+        # The kernels are visible only to the implementation of these functions,
+        # under the name "kernel" (used as ``kernel::...`` in the C++ body).
+        cuda_funcs = [self.point_to_voxel_hash]
+        self.add_impl_only_param_class(cuda_funcs, "kernel", Point2VoxelKernel(dtype, ndim, layout, zyx))
+
+        self.add_pybind_member("hashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+        self.add_pybind_member("point_indice_data", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+
+        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("num_per_voxel", "tv::Tensor", readwrite=False)
+        self.add_member("vsize", f"tv::array<float, {self.ndim}>")
+        self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
+        self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark_prop_getter(prop_name="grid_size")
+    @pccm.member_function
+    def get_grid_size(self):
+        """Build the ``grid_size`` property getter.
+
+        The generated function copies the ``grid_size`` member into a
+        ``std::array<int, ndim>`` and returns it.
+        """
+        code = pccm.FunctionCode()
+        code.raw(f"""
+        std::array<int, {self.ndim}> res;
+        for (int i = 0; i < {self.ndim}; ++i){{
+            res[i] = grid_size[i];
+        }}
+        return res;
+        """)
+        return code.ret(f"std::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark
+    @pccm.constructor
+    def ctor(self):
+        """Build the constructor.
+
+        C++ parameters: ``vsize_xyz`` (``ndim`` floats), ``coors_range_xyz``
+        (``2 * ndim`` floats: lower bounds followed by upper bounds),
+        ``num_point_features``, ``max_num_voxels`` and
+        ``max_num_points_per_voxel`` (ints).
+
+        The generated body stores the voxel size and range (each half reversed
+        when ``zyx`` is set), derives ``grid_size`` as
+        ``round((upper - lower) / vsize)`` per axis, computes ``grid_stride``
+        with the last axis having stride 1, and allocates ``voxels``
+        ``[max_num_voxels, max_num_points_per_voxel, num_point_features]``,
+        ``indices`` ``[max_num_voxels, ndim]`` and ``num_per_voxel``
+        ``[max_num_voxels]``. The scratch buffers start with a single element
+        and are grown on demand by ``point_to_voxel_hash``.
+        """
+        code = pccm.FunctionCode()
+        code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
+        code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
+        code.arg("num_point_features", f"int")
+        code.arg("max_num_voxels, max_num_points_per_voxel", f"int")
+        if self.zyx:
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[{self.ndim - 1} - i] = vsize_xyz[i];
+                coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
+                coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        else:
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[i] = vsize_xyz[i];
+                coors_range[i] = coors_range_xyz[i];
+                coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        # if zyx, grid_size is zyx.
+        # NOTE(review): the last argument of tv::zeros (0 here) is presumably
+        # the device index -- confirm against the tensorview API.
+        code.raw(f"""
+        int64_t prod = 1;
+        for (size_t i = 0; i < {self.ndim}; ++i) {{
+            grid_size[i] =
+                std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
+        }}
+        for (int i = {self.ndim} - 1; i >= 0; --i) {{
+            grid_stride[i] = prod;
+            prod *= grid_size[i];
+        }}
+        voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, 0);
+        indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, 0);
+        num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, 0);
+        hashdata = tv::zeros({{1}}, tv::custom128, 0);
+        point_indice_data = tv::zeros({{1}}, tv::int64, 0);
+        """)
+        return code
+
+    @pccm.pybind.mark
+    @pccm.cuda.member_function
+    def point_to_voxel_hash(self):
+        """Build the member function that voxelizes ``points`` on the GPU.
+
+        C++ parameters: ``points`` (``tv::Tensor``, must be 2-D with at least
+        ``ndim`` columns) and ``clear_voxels`` (``bool``, default ``true``;
+        zeroes ``voxels`` first).
+
+        The generated body:
+
+        1. Grows ``hashdata`` to ``2 * points.dim(0)`` entries and
+           ``point_indice_data`` to ``points.dim(0)`` entries when too small.
+        2. Zeroes ``num_per_voxel`` and clears the hash table.
+        3. Launches ``build_hash_table``, ``assign_table`` and
+           ``generate_voxel`` from the ``kernel`` namespace.
+        4. Copies the voxel counter to the host between the second and third
+           launch.
+
+        The counter counts every occupied hash slot, so it can be larger than
+        the capacity of the output buffers; it is clamped to ``voxels.dim(0)``
+        before slicing so the returned views never reach past the buffers.
+
+        Returns:
+            The ``FunctionCode`` with C++ return type
+            ``std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>``: the first
+            ``count`` rows of ``voxels``, ``indices`` and ``num_per_voxel``.
+        """
+        code = pccm.FunctionCode()
+        code.arg("points", "tv::Tensor")
+        code.arg("clear_voxels", "bool", "true")
+
+        code.raw(f"""
+        TV_ASSERT_INVALID_ARG(points.ndim() == 2 && points.dim(1) >= {self.ndim}, "error");
+        using V = int64_t;
+        using KeyType = int64_t;
+        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
+        if (clear_voxels){{
+            voxels.zero_();
+        }}
+        using table_t =
+            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
+                                        kEmptyKey, false>;
+        using pair_t = typename table_t::value_type;
+        // int64_t expected_hash_data_num = int64_t(tv::hash::align_to_power2(points.dim(0) * 2));
+        int64_t expected_hash_data_num = points.dim(0) * 2;
+
+        if (hashdata.dim(0) < expected_hash_data_num){{
+            hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
+        }}
+        if (point_indice_data.dim(0) < points.dim(0)){{
+            point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
+        }}
+        // auto timer = tv::CudaContextTimer<>();
+        num_per_voxel.zero_();
+        table_t hash = table_t(hashdata.data_ptr<pair_t>(), expected_hash_data_num);
+        hash.clear();
+        // tv::ssprint("clear time", timer.report());
+        auto launcher = tv::cuda::Launch(points.dim(0));
+        launcher(kernel::build_hash_table<table_t>, hash, points.data_ptr<const {self.dtype}>(),
+                point_indice_data.data_ptr<int64_t>(),
+                points.dim(1), vsize, coors_range, grid_size, grid_stride, points.dim(0));
+        // tv::ssprint("build_hash_table", timer.report());
+
+        auto table_launcher = tv::cuda::Launch(hash.size());
+        tv::Tensor count = tv::zeros({{1}}, tv::int32, 0);
+        Layout layout = Layout::from_shape(grid_size);
+        table_launcher(kernel::assign_table<table_t>, hash, indices.data_ptr<int>(),
+                        count.data_ptr<int>(),
+                        layout, voxels.dim(0));
+        auto count_cpu = count.cpu();
+        int count_val = count_cpu.item<int32_t>();
+        // assign_table counts every occupied slot, even past the buffer capacity;
+        // clamp so the slices below stay inside the preallocated buffers.
+        int max_voxel_count = voxels.dim(0);
+        count_val = count_val > max_voxel_count ? max_voxel_count : count_val;
+        // tv::ssprint("assign_table", timer.report());
+
+        launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
+                point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
+                num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1), 
+                voxels.dim(0), vsize, coors_range,
+                grid_size, grid_stride, points.dim(0));
+        // tv::ssprint("generate_voxel", timer.report());
+
+        return std::make_tuple(voxels.slice_first_axis(0, count_val), 
+            indices.slice_first_axis(0, count_val), 
+            num_per_voxel.slice_first_axis(0, count_val));
+
+        """)
+        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
+
+
+
+class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
+    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
+        super().__init__()
+        self.add_dependency(TensorView)
+        layout = TensorGeneric(ndim, True)
+        self.add_param_class("layout_ns", layout, "Layout")
+        self.dtype = dtype 
+        self.ndim = ndim 
+        self.zyx = zyx
+
+        self.add_pybind_member("densehashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
+
+        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
+        self.add_pybind_member("num_per_voxel", "tv::Tensor", readwrite=False)
+        self.add_member("mean_per_voxel", "tv::Tensor")
+
+        self.add_member("vsize", f"tv::array<float, {self.ndim}>")
+        self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
+        self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
+        self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark_prop_getter(prop_name="grid_size")
+    @pccm.member_function
+    def get_grid_size(self):
+        code = pccm.FunctionCode()
+        code.raw(f"""
+        std::array<int, {self.ndim}> res;
+        for (int i = 0; i < {self.ndim}; ++i){{
+            res[i] = grid_size[i];
+        }}
+        return res;
+        """)
+        return code.ret(f"std::array<int, {self.ndim}>")
+
+    @pccm.pybind.mark
+    @pccm.constructor
+    def ctor(self):
+        code = pccm.FunctionCode()
+        code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
+        code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
+        code.arg("num_point_features", f"int")
+        code.arg("max_num_voxels, max_num_points_per_voxel", f"int")
+        if self.zyx:
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[{self.ndim - 1} - i] = vsize_xyz[i];
+                coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
+                coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        else:
+            code.raw(f"""
+            for (int i = 0; i < {self.ndim}; ++i){{
+                vsize[i] = vsize_xyz[i];
+                coors_range[i] = coors_range_xyz[i];
+                coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
+            }}
+            """)
+        code.raw(f"""
+        int64_t prod = 1;
+        for (size_t i = 0; i < {self.ndim}; ++i) {{
+            grid_size[i] =
+                std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
+        }}
+        for (int i = {self.ndim} - 1; i >= 0; --i) {{
+            grid_stride[i] = prod;
+            prod *= grid_size[i];
+        }}
+        voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, -1);
+        indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, -1);
+        num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, -1);
+        mean_per_voxel = tv::zeros({{max_num_voxels, num_point_features}}, tv::DType({self.dtype.tv_dtype}), -1);
+        tv::TensorShape grid_shape(grid_size.data(), grid_size.data() + {self.ndim});
+        densehashdata = tv::zeros(grid_shape, tv::int32, -1);
+        auto densehashdata_ptr = densehashdata.data_ptr<int>();
+        for (int i= 0; i < densehashdata.size(); ++i){{
+            densehashdata_ptr[i] = -1;
+        }}
+        """)
+        return code 
+
+    def point_to_voxel_template(self, mean: bool = False):
+        code = pccm.FunctionCode()
+        code.arg("points", "tv::Tensor")
+        code.arg("clear_voxels", "bool", "true")
+
+        point_xyz = f"{self.ndim - 1} - j"
+        if not self.zyx:
+            point_xyz = f"j"
+        code.raw(f"""
+        auto max_num_voxels = voxels.dim(0);
+        auto max_num_points_per_voxel = voxels.dim(1);
+        num_per_voxel.zero_();
+        if (clear_voxels){{
+            voxels.zero_();
+        }}
+        """)
+        if mean:
+            code.raw(f"mean_per_voxel.zero_();")
+            code.raw(f"auto means_rw = mean_per_voxel.tview<{self.dtype}, 2>();")
+        else:
+            code.raw(f"auto means_rw = mean_per_voxel.tview<{self.dtype}, 2>();")
+        
+        code.raw(f"""
+        int res_voxel_num = 0;
+        int num_features = points.dim(1);
+        auto N = points.dim(0);
+        int c;
+        TV_ASSERT_RT_ERR(num_features == voxels.dim(2), "your points num features doesn't equal to voxel.");
+        constexpr bool kUseMean = {pccm.boolean(mean)};
+        tv::dispatch<float, double>(points.dtype(), [&](auto I){{
+            using T = decltype(I);
+            auto points_rw = points.tview<T, 2>();
+            auto coors_rw = indices.tview<int, 2>();
+            auto voxels_rw = voxels.tview<{self.dtype}, 3>();
+            auto num_points_per_voxel_rw = num_per_voxel.tview<int, 1>();
+            
+            int coor[{self.ndim}];
+            auto coor_to_voxelidx_rw = densehashdata.tview<int, {self.ndim}>();
+            int voxelidx, num;
+            bool failed;
+            int voxel_num = 0;
+            for (int i = 0; i < N; ++i) {{
+                failed = false;
+                for (int j = 0; j < {self.ndim}; ++j) {{
+                    c = floor((points_rw(i, {point_xyz}) - coors_range[j]) / vsize[j]);
+                    if ((c < 0 || c >= grid_size[j])) {{
+                        failed = true;
+                        break;
+                    }}
+                    coor[j] = c;
+                }}
+                if (failed)
+                    continue;
+                voxelidx = coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))});
+
+                if (voxelidx == -1) {{
+                    voxelidx = voxel_num;
+                    if (voxel_num >= max_num_voxels)
+                        continue;
+                    voxel_num += 1;
+                    coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))}) = voxelidx;
+                    for (int k = 0; k < {self.ndim}; ++k) {{
+                        coors_rw(voxelidx, k) = coor[k];
+                    }}
+                }}
+                num = num_points_per_voxel_rw(voxelidx);
+                if (num < max_num_points_per_voxel) {{
+                    // voxel_point_mask_rw(voxelidx, num) = {self.dtype}(1);
+                    for (int k = 0; k < num_features; ++k) {{
+                        voxels_rw(voxelidx, num, k) = points_rw(i, k);
+                    }}
+                    num_points_per_voxel_rw(voxelidx) += 1;
+                    if TV_IF_CONSTEXPR (kUseMean){{
+                        for (int k = 0; k < num_features; ++k) {{
+                            means_rw(voxelidx, k) +=
+                                (points_rw(i, k) - means_rw(voxelidx, k)) / {self.dtype}(num + 1);
+                        }}
+                    }}
+                }}
+            }}
+            for (int i = 0; i < voxel_num; ++i) {{
+                coor_to_voxelidx_rw({codeops.unpack("coors_rw", range(self.ndim), left="(i, ", right=")")}) = -1;
+                if TV_IF_CONSTEXPR (kUseMean){{
+                    num = num_points_per_voxel_rw(i);
+                    for (int j = num; j < max_num_points_per_voxel; ++j) {{
+                        for (int k = 0; k < num_features; ++k) {{
+                            voxels_rw(i, j, k) = means_rw(i, k);
+                        }}
+                    }}
+                }}
+            }}
+            res_voxel_num = voxel_num;
+        }});
+        return std::make_tuple(voxels.slice_first_axis(0, res_voxel_num), 
+            indices.slice_first_axis(0, res_voxel_num), 
+            num_per_voxel.slice_first_axis(0, res_voxel_num));
+        """)
+        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
+
+
+    @pccm.pybind.mark
+    @pccm.member_function
+    def point_to_voxel(self):
+        # Builds this member function from the shared template with the
+        # template's flag switched off.
+        # NOTE(review): the flag presumably drives kUseMean in the generated
+        # C++ -- confirm against point_to_voxel_template's signature.
+        template_flag = False
+        return self.point_to_voxel_template(template_flag)
+
+    @pccm.pybind.mark
+    @pccm.member_function
+    def point_to_voxel_empty_mean(self):
+        # Builds this member function from the shared template with the
+        # template's flag switched on.
+        # NOTE(review): the flag presumably drives kUseMean in the generated
+        # C++ (unused point slots filled with the voxel mean) -- confirm
+        # against point_to_voxel_template's signature.
+        template_flag = True
+        return self.point_to_voxel_template(template_flag)
--- a/spconv/ops.py
+++ b/spconv/ops.py
-# Copyright 2019-2020 Yan Yan
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from enum import Enum
-
-import torch
-
-import spconv
-
-
-class ConvAlgo(Enum):
-    Native = 0  # small memory cost, faster when number of points is large.
-    Batch = 1  # high memory cost, faster when number of points is small (< 50000)
-    BatchGemmGather = 2  # high memory cost, faster when number of points medium
-    SparseConvNet = 3
-    Minkowski = 4  # https://github.com/StanfordVL/MinkowskiEngine/blob/master/src/convolution.cu
-
-
-def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
-    ndim = len(input_size)
-    output_size = []
-    for i in range(ndim):
-        size = (input_size[i] + 2 * padding[i] - dilation[i] *
-                (kernel_size[i] - 1) - 1) // stride[i] + 1
-        if kernel_size[i] == -1:
-            output_size.append(1)
-        else:
-            output_size.append(size)
-    return output_size
-
-
-def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
-                           output_padding):
-    ndim = len(input_size)
-    output_size = []
-    for i in range(ndim):
-        if kernel_size[i] == -1:
-            raise ValueError("deconv don't support kernel_size < 0")
-        size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
-            i] + output_padding[i]
-        output_size.append(size)
-    return output_size
-
-
-def get_indice_pairs(indices,
-                     batch_size,
-                     spatial_shape,
-                     ksize=3,
-                     stride=1,
-                     padding=0,
-                     dilation=1,
-                     out_padding=0,
-                     subm=False,
-                     transpose=False,
-                     grid=None,
-                     use_hash=False):
-    ndim = indices.shape[1] - 1
-    if not isinstance(ksize, (list, tuple)):
-        ksize = [ksize] * ndim
-    if not isinstance(stride, (list, tuple)):
-        stride = [stride] * ndim
-    if not isinstance(padding, (list, tuple)):
-        padding = [padding] * ndim
-    if not isinstance(dilation, (list, tuple)):
-        dilation = [dilation] * ndim
-    if not isinstance(out_padding, (list, tuple)):
-        out_padding = [out_padding] * ndim
-
-    for d, s in zip(dilation, stride):
-        assert any([s == 1, d == 1]), "don't support this."
-
-    if not subm:
-        if transpose:
-            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
-                                               padding, dilation, out_padding)
-        else:
-            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
-                                             padding, dilation)
-    else:
-        out_shape = spatial_shape
-    if grid is None:
-        grid = torch.Tensor()
-    res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size,
-                                            out_shape, spatial_shape, ksize,
-                                            stride, padding, dilation,
-                                            out_padding, int(subm),
-                                            int(transpose), int(use_hash))
-    return res
-
-
-def indice_conv(features,
-                filters,
-                indice_pairs,
-                indice_pair_num,
-                num_activate_out,
-                inverse=False,
-                subm=False,
-                algo=ConvAlgo.Native.value):
-    return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
-                                        indice_pair_num, num_activate_out,
-                                        int(inverse), int(subm), algo)
-
-
-def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
-                      num_activate_out, inverse, subm):
-    return torch.ops.spconv.fused_indice_conv_bn(features, filters, bias,
-                                                 indice_pairs, indice_pair_num,
-                                                 num_activate_out,
-                                                 int(inverse), int(subm))
-
-
-def indice_conv_backward(features,
-                         filters,
-                         out_bp,
-                         indice_pairs,
-                         indice_pair_num,
-                         inverse=False,
-                         subm=False,
-                         algo=ConvAlgo.Native.value):
-    return torch.ops.spconv.indice_conv_backward(features, filters, out_bp,
-                                                 indice_pairs, indice_pair_num,
-                                                 int(inverse), int(subm), algo)
-
-
-def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
-    return torch.ops.spconv.indice_maxpool(features, indice_pairs,
-                                           indice_pair_num, num_activate_out)
-
-
-def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
-                            indice_pair_num):
-    return torch.ops.spconv.indice_maxpool_backward(features, out_features,
-                                                    out_bp, indice_pairs,
-                                                    indice_pair_num)
-
-
-def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
-    res = torch.ops.spconv.nms(boxes, scores, pre_max_size, post_max_size,
-                               thresh, eps)
-    return res
-
-
-def pillar_scatter(features, coors, shape):
-    if features.dtype == torch.float32:
-        return torch.ops.spconv.pillar_scatter_float(features, coors, shape)
-    elif features.dtype == torch.half:
-        return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
-    else:
-        raise NotImplementedError
--- a/spconv/pytorch/__init__.py
+++ b/spconv/pytorch/__init__.py
+import platform
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from spconv.pytorch import ops
+from spconv.pytorch.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
+                         SparseConvTranspose3d, SparseInverseConv2d,
+                         SparseInverseConv3d, SubMConv2d, SubMConv3d)
+from spconv.pytorch.core import SparseConvTensor
+from spconv.pytorch.identity import Identity
+from spconv.pytorch.modules import SparseModule, SparseSequential
+from spconv.pytorch.ops import ConvAlgo
+from spconv.pytorch.pool import SparseMaxPool2d, SparseMaxPool3d
+from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable
+
+
+class ToDense(SparseModule):
+    """Turn a SparseConvTensor into a dense tensor in NCHW layout.
+
+    The conversion itself is delegated to the tensor's ``dense()`` method.
+    """
+    def forward(self, x: SparseConvTensor):
+        dense_output = x.dense()
+        return dense_output
+
+
+class RemoveGrid(SparseModule):
+    """Drop the pre-allocated grid buffer held by a SparseConvTensor."""
+    def forward(self, x: SparseConvTensor) -> SparseConvTensor:
+        # The incoming tensor is modified in place and handed back; no copy
+        # is made.
+        x.grid = None
+        return x
--- a/spconv/conv.py
+++ b/spconv/conv.py
-# Copyright 2019-2020 Yan Yan
+# Copyright 2021 Yan Yan
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,12 +21,13 @@ from torch import nn
 from torch.nn import init
 from torch.nn.parameter import Parameter

-import spconv
-import spconv.functional as Fsp
-from spconv import ops
-from spconv.core import IndiceData, SparseConvTensor
-from spconv.modules import SparseModule
-
+from spconv import pytorch as spconv
+from spconv.algo import ConvAlgo
+import spconv.pytorch.functional as Fsp
+from spconv.pytorch import ops
+from spconv.pytorch.core import IndiceData, SparseConvTensor
+from spconv.pytorch.modules import SparseModule
+from spconv.constants import FILTER_HWIO

 def _calculate_fan_in_and_fan_out_hwio(tensor):
    dimensions = tensor.ndimension()
@@ -39,8 +40,8 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
        fan_in = tensor.size(-2)
        fan_out = tensor.size(-1)
    else:
-        num_input_fmaps = tensor.size(-2)
-        num_output_fmaps = tensor.size(-1)
+        num_input_fmaps = tensor.size(-1)
+        num_output_fmaps = tensor.size(-2)
        receptive_field_size = 1
        if tensor.dim() > 2:
            receptive_field_size = tensor[..., 0, 0].numel()
@@ -72,7 +73,6 @@ class SparseConvolution(SparseModule):
                 inverse=False,
                 indice_key=None,
                 fused_bn=False,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SparseConvolution, self).__init__(name=name)
@@ -106,20 +106,25 @@ class SparseConvolution(SparseModule):
        self.subm = subm
        self.indice_key = indice_key
        self.fused_bn = fused_bn
-        self.use_hash = use_hash
-        self.algo = algo.value
-
+        self.algo = algo
+        if FILTER_HWIO:
            self.weight = Parameter(
                torch.Tensor(*kernel_size, in_channels, out_channels))
+        else:
+            self.weight = Parameter(
+                torch.Tensor(*kernel_size, out_channels, in_channels))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
+        # self.workspace_for_splitk = torch.zeros((GLOBAL_MAXIMUM_SPLITK,), dtype=torch.int8)
+        # self.register_buffer("workspace_for_splitk", self.workspace_for_splitk)
        self.reset_parameters()

    def reset_parameters(self):
        n = self.in_channels
-        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        # init.uniform_(self.weight, 0, 0.001)
+        init.kaiming_uniform_(self.weight, a=math.sqrt(0.005))
        if self.bias is not None:
            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
            bound = 1 / math.sqrt(fan_in)
@@ -171,9 +176,15 @@ class SparseConvolution(SparseModule):
                    }
                }
        if self.conv1x1:
+            if FILTER_HWIO:
+                features = torch.mm(
+                    input.features,
+                    self.weight.view(self.out_channels, self.in_channels).T)
+            else:
                features = torch.mm(
                    input.features,
-                self.weight.view(self.in_channels, self.out_channels))
+                    self.weight.view(self.in_channels, self.out_channels).T)
+
            if self.bias is not None:
                features += self.bias
            out_tensor.features = features
@@ -201,15 +212,14 @@ class SparseConvolution(SparseModule):
                    indices,
                    batch_size,
                    spatial_shape,
+                    self.algo,
                    self.kernel_size,
                    self.stride,
                    self.padding,
                    self.dilation,
                    self.output_padding,
                    self.subm,
-                    self.transposed,
-                    grid=input.grid,
-                    use_hash=self.use_hash)
+                    self.transposed)
                if input.benchmark:
                    torch.cuda.synchronize()
                    interval = time.time() - t
@@ -264,6 +274,32 @@ class SparseConvolution(SparseModule):
        out_tensor.spatial_shape = out_spatial_shape
        return out_tensor

+class SparseConv1d(SparseConvolution):
+    """Sparse convolution specialised to a single spatial dimension.
+
+    The constructor adds nothing of its own: it forwards its arguments to
+    ``SparseConvolution`` with a leading dimension count of 1.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        # Nine positional values first, then the keyword-only options.
+        super().__init__(1,
+                         in_channels, out_channels, kernel_size,
+                         stride, padding, dilation, groups, bias,
+                         indice_key=indice_key,
+                         algo=algo,
+                         name=name)
+

 class SparseConv2d(SparseConvolution):
    def __init__(self,
@@ -276,7 +312,6 @@ class SparseConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SparseConv2d, self).__init__(2,
@@ -289,7 +324,6 @@ class SparseConv2d(SparseConvolution):
                                           groups,
                                           bias,
                                           indice_key=indice_key,
-                                           use_hash=use_hash,
                                           algo=algo,
                                           name=name)

@@ -305,7 +339,6 @@ class SparseConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SparseConv3d, self).__init__(3,
@@ -318,7 +351,6 @@ class SparseConv3d(SparseConvolution):
                                           groups,
                                           bias,
                                           indice_key=indice_key,
-                                           use_hash=use_hash,
                                           algo=algo,
                                           name=name)

@@ -334,7 +366,6 @@ class SparseConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SparseConv4d, self).__init__(4,
@@ -347,7 +378,6 @@ class SparseConv4d(SparseConvolution):
                                           groups,
                                           bias,
                                           indice_key=indice_key,
-                                           use_hash=use_hash,
                                           algo=algo,
                                           name=name)

@@ -363,7 +393,6 @@ class SparseConvTranspose2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SparseConvTranspose2d, self).__init__(2,
@@ -377,7 +406,6 @@ class SparseConvTranspose2d(SparseConvolution):
                                                    bias,
                                                    transposed=True,
                                                    indice_key=indice_key,
-                                                    use_hash=use_hash,
                                                    algo=algo,
                                                    name=name)

@@ -393,7 +421,6 @@ class SparseConvTranspose3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SparseConvTranspose3d, self).__init__(3,
@@ -407,7 +434,25 @@ class SparseConvTranspose3d(SparseConvolution):
                                                    bias,
                                                    transposed=True,
                                                    indice_key=indice_key,
-                                                    use_hash=use_hash,
+                                                    algo=algo,
+                                                    name=name)
+
+class SparseInverseConv1d(SparseConvolution):
+    """One-dimensional sparse convolution constructed with ``inverse=True``.
+
+    ``indice_key`` is a required argument; the constructor always passes a
+    dimension count of 1 and ``inverse=True`` to ``SparseConvolution``.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, indice_key,
+                 bias=True, algo=ops.ConvAlgo.Native, name=None):
+        # Four positional values, everything else by keyword.
+        super(SparseInverseConv1d, self).__init__(1,
+                                                  in_channels,
+                                                  out_channels,
+                                                  kernel_size,
+                                                  bias=bias,
+                                                  inverse=True,
+                                                  indice_key=indice_key,
                                                   algo=algo,
                                                   name=name)

@@ -451,6 +496,52 @@ class SparseInverseConv3d(SparseConvolution):
                                                  algo=algo,
                                                  name=name)

+class SparseInverseConv4d(SparseConvolution):
+    """Four-dimensional sparse convolution constructed with ``inverse=True``.
+
+    ``indice_key`` is a required argument; the constructor always passes a
+    dimension count of 4 and ``inverse=True`` to ``SparseConvolution``.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, indice_key,
+                 bias=True, algo=ops.ConvAlgo.Native, name=None):
+        super().__init__(
+            4,  # dimension count handed to the base class
+            in_channels,
+            out_channels,
+            kernel_size,
+            bias=bias,
+            inverse=True,
+            indice_key=indice_key,
+            algo=algo,
+            name=name)
+
+class SubMConv1d(SparseConvolution):
+    """One-dimensional member of the ``SubM`` convolution family.
+
+    All arguments are forwarded to ``SparseConvolution`` with a dimension
+    count of 1, followed by one extra positional ``True`` after ``bias``.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        # NOTE(review): the trailing positional True presumably switches on
+        # the base class's ``subm`` option -- confirm against its signature.
+        positional = (1, in_channels, out_channels, kernel_size, stride,
+                      padding, dilation, groups, bias, True)
+        super().__init__(*positional,
+                         indice_key=indice_key,
+                         algo=algo,
+                         name=name)
+

 class SubMConv2d(SparseConvolution):
    def __init__(self,
@@ -463,7 +554,6 @@ class SubMConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SubMConv2d, self).__init__(2,
@@ -477,7 +567,6 @@ class SubMConv2d(SparseConvolution):
                                         bias,
                                         True,
                                         indice_key=indice_key,
-                                         use_hash=use_hash,
                                         algo=algo,
                                         name=name)

@@ -493,7 +582,6 @@ class SubMConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SubMConv3d, self).__init__(3,
@@ -507,7 +595,6 @@ class SubMConv3d(SparseConvolution):
                                         bias,
                                         True,
                                         indice_key=indice_key,
-                                         use_hash=use_hash,
                                         algo=algo,
                                         name=name)

@@ -523,7 +610,6 @@ class SubMConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super(SubMConv4d, self).__init__(4,
@@ -537,6 +623,5 @@ class SubMConv4d(SparseConvolution):
                                         bias,
                                         True,
                                         indice_key=indice_key,
-                                         use_hash=use_hash,
                                         algo=algo,
                                         name=name)