bug fix

62c1496f · yan.yan · 01ed382c · 62c1496f · 62c1496f · 62c1496f
Commit 62c1496f authored Oct 19, 2021 by yan.yan
18 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
+name: build
+
+on:
+  push:
+    branches:
+      - main
+      - feature/*
+    tags:
+      - '*'
+  
+jobs:
+  build-windows:
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] 
+        cuda-version: ['10.2', '11.1', '11.4']
+    steps:
+      - uses: actions/checkout@master
+      - name: Install CUDA
+        env:
+          CUDA_VERSION: ${{ matrix.cuda-version }}
+          PYTHON_VERSION: ${{ matrix.python-version }}
+          cuda: ${{ matrix.cuda-version }}
+        if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
+        shell: powershell
+        run: .\tools\install_windows_cuda.ps1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: ilammy/msvc-dev-cmd@v1
+      - name: Install pep build
+        run: |
+          python -m pip install build --user
+          python -m pip install --upgrade pip twine wheel
+          python -m pip install pytest setuptools
+
+      - name: Build a windows binary wheel
+        env:
+          CUDA_VERSION: ${{ matrix.cuda-version }}
+          PYTHON_VERSION: ${{ matrix.python-version }}
+        if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
+        run: |
+          $Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
+          $Env:CUMM_CUDA_ARCH_LIST = "all"
+          $Env:SPCONV_DISABLE_JIT = "1"
+          pip install pccm pybind11
+          python -m build --wheel --outdir dist/ .
+        shell: powershell
+
+      - name: Publish a Python distribution to PyPI
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+        run: |
+          $Env:TWINE_USERNAME = "__token__"
+          $Env:TWINE_PASSWORD = "${{ secrets.pypi_password }}"
+          twine upload dist/*
+        shell: powershell
+  
+  build:
+    needs: build-windows
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        python-version: ['3.8'] # this version is only used for upload.
+        cuda-version: ['102', '111', '114']
+
+    steps:
+      - uses: actions/checkout@master
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install pep build
+        run: |
+          python -m pip install build --user
+          python -m pip install --upgrade pip twine wheel
+          python -m pip install pytest setuptools
+
+      - name: Build a cuda wheel
+        env:
+          CUDA_VERSION: ${{ matrix.cuda-version }}
+          PYTHON_VERSION: ${{ matrix.python-version }}
+          DOCKER_IMAGE: scrin/manylinux2014-cuda:cu${{ matrix.cuda-version }}-devel
+          PLAT: manylinux2014_x86_64
+        if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || env.CUDA_VERSION == '114'
+        run: |
+          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -v `pwd`:/io $DOCKER_IMAGE bash -c "/io/tools/build-wheels.sh"
+
+      - name: Publish a Python distribution to PyPI
+        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+        uses: pypa/gh-action-pypi-publish@master
+        with:
+          user: __token__
+          password: ${{ secrets.pypi_password }}
\ No newline at end of file
--- a/README.md
+++ b/README.md
@@ -34,7 +34,6 @@
 * training/inference speed is increased
 * support int8/tensor core
 * doesn't depend on pytorch binary. 
-* If your GPU has tensor core, try mixed precision training in spconv 2.x!
 * since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference.

 ## TODO in Spconv 2.x

--- a/pyproject.toml
+++ b/pyproject.toml
 [build-system]
-requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.5", "cumm>=0.1.3"]
+requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.5", "cumm>=0.1.5"]
 build-backend = "setuptools.build_meta"
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@ DESCRIPTION = 'spatial sparse convolution'
 URL = 'https://github.com/traveller59/spconv'
 EMAIL = 'yanyan.sub@outlook.com'
 AUTHOR = 'Yan Yan'
-REQUIRES_PYTHON = '>=3.7'
+REQUIRES_PYTHON = '>=3.6'
 VERSION = None

 # What packages are required for this module to be executed?
@@ -134,12 +134,11 @@ if disable_jit is not None and disable_jit == "1":
    }
    from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
    from spconv.csrc.sparse.all import SpconvOps
-    from cumm.gemm.gather import GatherAll
    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)

    cu.namespace = "cumm.gemm.main"
    ext_modules: List[Extension] = [
-        PCCMExtension([cu, SpconvOps(), GatherAll()],
+        PCCMExtension([cu, SpconvOps()],
                      "spconv/core_cc",
                      Path(__file__).resolve().parent / "spconv")
    ]

--- a/spconv/build.py
+++ b/spconv/build.py
@@ -22,7 +22,6 @@ from .constants import PACKAGE_NAME, PACKAGE_ROOT
 if project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME):
    from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
    from spconv.csrc.sparse.all import SpconvOps
-    # from cumm.gemm.gather import GatherAll, ScatterAll
    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
    cu.namespace = "cumm.gemm.main"
    pccm.builder.build_pybind([cu, SpconvOps()],

--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
@@ -3,14 +3,12 @@ from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
 class SpconvOps:
    @staticmethod
-    def generate_conv_inds(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int]) -> int: 
+    def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> None: 
        """
        Args:
            indices: 
-            hashdata: 
            indice_pairs: 
            indice_pairs_uniq: 
-            out_inds: 
            indice_num_per_loc: 
            batch_size: 
            output_dims: 
@@ -19,28 +17,22 @@ class SpconvOps:
            stride: 
            padding: 
            dilation: 
+            transposed: 
+            stream_int: 
        """
        ...
    @staticmethod
-    def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int: 
+    def generate_conv_inds_stage1_5(indice_pairs_uniq: Tensor, ndim: int, uniq_size: int, stream_int: int = 0) -> int: 
        """
        Args:
-            indices: 
-            indice_pairs: 
            indice_pairs_uniq: 
-            indice_num_per_loc: 
-            batch_size: 
-            output_dims: 
-            input_dims: 
-            ksize: 
-            stride: 
-            padding: 
-            dilation: 
+            ndim: 
+            uniq_size: 
            stream_int: 
        """
        ...
    @staticmethod
-    def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int: 
+    def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int: 
        """
        Args:
            indices: 
@@ -56,6 +48,7 @@ class SpconvOps:
            stride: 
            padding: 
            dilation: 
+            transposed: 
            stream_int: 
        """
        ...

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
@@ -35,26 +35,30 @@ class SpconvOps(pccm.Class):
            problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
            indices = SparseConvIndicesKernel(problem, dtypes.int32)
            # self.add_param_class("ops", indices, "SpconvIndices")
-            cuda_funcs = [self.generate_conv_inds, self.generate_subm_conv_inds, 
-                self.generate_conv_inds_stage1, self.generate_conv_inds_stage2, self.sort_1d_by_key]
+            cuda_funcs = [self.generate_subm_conv_inds, 
+                self.generate_conv_inds_stage1, self.generate_conv_inds_stage1_5, self.generate_conv_inds_stage2, self.sort_1d_by_key]
            self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")


    @pccm.pybind.mark
    @pccm.cuda.static_function
-    def generate_conv_inds(self):
+    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
-        code.arg("indices, hashdata", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
+        code.arg("indices", "tv::Tensor")
+        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.arg("transposed", f"bool", "false")
+
+        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
        int ndim = indices.dim(1) - 1;
        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
            padding.size() == ndim, "your params size not equal to ndim", ndim);
        """)
+
        for ndim in self.ndims:
            code.raw(f"""
            if (ndim == {ndim}){{
@@ -68,53 +72,31 @@ class SpconvOps(pccm.Class):
                    padding_[i] = padding[i];
                    dilation_[i] = dilation[i];
                }}
-                return SpconvIndices{ndim}D::generate_conv_inds(indices, hashdata,
-                    indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc,
+                return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
+                    indice_pairs, indice_pairs_uniq, indice_num_per_loc,
                    batch_size, output_dims_, input_dims_, 
-                    ksize_, stride_, padding_, dilation_);
+                    ksize_, stride_, padding_, dilation_, transposed, stream_int);
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-        return code.ret("int")
+
+        return code# .ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
-    def generate_conv_inds_stage1(self):
+    def generate_conv_inds_stage1_5(self):
        code = pccm.FunctionCode()
-        code.arg("indices", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
-        code.arg("batch_size", "int")
-        code.arg("output_dims, input_dims", f"std::vector<int>")
-        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.arg("indice_pairs_uniq", "tv::Tensor")
+        code.arg("ndim", "int")
+        code.arg("uniq_size", "int64_t")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
-        code.raw(f"""
-        int ndim = indices.dim(1) - 1;
-        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
-            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
-            padding.size() == ndim, "your params size not equal to ndim", ndim);
-        """)
-
        for ndim in self.ndims:
            code.raw(f"""
            if (ndim == {ndim}){{
-                tv::array<int, {ndim}> output_dims_, input_dims_;
-                tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
-                for (int i = 0; i < {ndim}; ++i){{
-                    output_dims_[i] = output_dims[i];
-                    input_dims_[i] = input_dims[i];
-                    ksize_[i] = ksize[i];
-                    stride_[i] = stride[i];
-                    padding_[i] = padding[i];
-                    dilation_[i] = dilation[i];
-                }}
-                return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
-                    indice_pairs, indice_pairs_uniq, indice_num_per_loc,
-                    batch_size, output_dims_, input_dims_, 
-                    ksize_, stride_, padding_, dilation_);
+                return SpconvIndices{ndim}D::generate_conv_inds_stage1_5(indice_pairs_uniq, uniq_size, stream_int);
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
-
        return code.ret("int")

    @pccm.pybind.mark
@@ -127,6 +109,7 @@ class SpconvOps(pccm.Class):
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
+        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
        int ndim = indices.dim(1) - 1;
@@ -151,7 +134,7 @@ class SpconvOps(pccm.Class):
                return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata,
                    indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
                    batch_size, output_dims_, input_dims_, 
-                    ksize_, stride_, padding_, dilation_);
+                    ksize_, stride_, padding_, dilation_, transposed, stream_int);
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")

--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
@@ -225,6 +225,28 @@ class ConvOutLocIter(pccm.ParameterizedClass):
        """)
        return code 

+    @pccm.cuda.member_function(host=True,
+                               device=True,
+                               forceinline=True,
+                               const=True)
+    def query_nhw_out(self):
+        code = pccm.FunctionCode()
+        code.arg("npq_offset", "const int*")
+        code.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
+        code.ret("bool")
+        code.raw(f"""
+        nhw_offset = npq_to_nhw(npq_offset);
+        """)
+        hw_valid = [] # type: List[str]
+        for i in range(self.ndim):
+            hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
+                            f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
+        code.raw(f"""
+        return nhw_offset[0] < problem_.N && 
+            {' && '.join(hw_valid)};
+        """)
+        return code 
+
 class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
        super().__init__()
@@ -255,7 +277,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("indices_pair_size", "int")

        code.arg("RS", "int")
-        # code.arg("bool", "transposed")
+        code.arg("transposed", "bool")

        code.raw(f"""
        int filter_offset = blockIdx.y;
@@ -264,7 +286,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
        for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
            tv::array<int, {self.ndim + 1}> npq_offset;
-            if (loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset)){{
+            bool valid;
+            if (transposed){{
+                valid = loc_iter.query_nhw_out(indices_in + i * {self.ndim + 1}, npq_offset);
+            }}else{{
+                valid = loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset);
+            }}
+            if (valid){{
                int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
                {self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
                if (old_num < indices_pair_size){{
@@ -514,81 +542,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code

-    @pccm.cuda.static_function
-    def generate_conv_inds(self):
-        code = pccm.FunctionCode()
-        code.arg("indices, hashdata", "tv::Tensor")
-        code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
-        code.arg("batch_size", "int")
-        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
-        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
-        code.raw(f"""
-        // TODO stream
-        // TODO handle num input == 0
-        int kv = tv::arrayops::prod(ksize);
-        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
-        // indice_pairs: [2, kv, indices.dim(0)]
-        // indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
-        // out_inds: [MaxSize, {self.ndim + 1}]
-        auto timer = tv::CudaContextTimer<>();
-        int64_t uniq_size = indice_pairs.size() / 2 + 1;
-        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
-        TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
-        
-        int64_t expected_out_size = indices.dim(0) * kv;
-        TV_ASSERT_RT_ERR(out_inds.dim(0) == expected_out_size && out_inds.dim(1) == {self.ndim + 1}, "error");
-        tv::cuda::Launch launcher_num_act_in(indices.dim(0));
-        // tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
-        launcher_num_act_in.blocks.y = kv;
-
-        ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
-        ConvLocIter loc_iter(problem);
-        tv::cuda::Launch launcher_clean_uniq(uniq_size);
-        launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), uniq_size);
-        tv::ssprint("clean time", timer.report() / 1000.0);
-
-        launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(), 
-            indice_pairs.data_ptr<{self.dtype_indices}>(), 
-            indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
-            indice_pairs.dim(2), kv);
-        tv::ssprint("calc_conv_indices_stage1 time", timer.report() / 1000.0, uniq_size);
-
-        thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
-        auto thrust_ctx = thrust::cuda::par.on(0);
-        thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
-        auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
-        auto num_out_act = new_end - ptr_tr - 1;
-        tv::ssprint("unique time", num_out_act, timer.report() / 1000.0);
-
-        // return num_out_act;
-        // TODO handle invalid num_out_act
-        indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
-        tv::cuda::Launch lanucher_build_hash(num_out_act);
-        using V = {self.dtype_indices};
-        using KeyType = {self.dtype_indices};
-        constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
-
-        using table_t =
-            tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
-                                        kEmptyKey, false>;
-        using pair_t = typename table_t::value_type;
-        TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
-        table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
-        hash.clear();
-        tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
-
-        lanucher_build_hash(build_conv_hash_table<table_t>, hash, out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(), 
-            loc_iter.layout_npq, num_out_act);
-        tv::ssprint("build_hash time", num_out_act, timer.report() / 1000.0);
-
-        launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, indice_pairs[1].data_ptr<int>(), indices.dim(0), 
-            indice_pairs.dim(2));
-        tv::ssprint("gem conv inds time", timer.report() / 1000.0);
-        return num_out_act;
-        """)
-
-        return code.ret("int")
-
    @pccm.cuda.static_function
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
@@ -597,6 +550,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("transposed", f"bool", "false")
+
        code.arg("stream_int", f"std::uintptr_t", "0")

        code.raw(f"""
@@ -620,7 +575,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(), 
            indice_pairs.data_ptr<{self.dtype_indices}>(), 
            indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
-            indice_pairs.dim(2), kv);
+            indice_pairs.dim(2), kv, transposed);
+        // thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
+        // auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
+        // thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        // auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
+        // auto num_out_act = new_end - ptr_tr - 1;
+        // return num_out_act;
+        """)
+        return code# .ret("int")
+
+    @pccm.cuda.static_function
+    def generate_conv_inds_stage1_5(self):
+        code = pccm.FunctionCode()
+        code.arg("indice_pairs_uniq", "tv::Tensor")
+        code.arg("uniq_size", "int64_t")
+        code.arg("stream_int", f"std::uintptr_t", "0")
+        code.raw(f"""
        thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
        auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
        thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
@@ -630,6 +601,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code.ret("int")

+
    @pccm.cuda.static_function
    def generate_conv_inds_stage2(self):
        code = pccm.FunctionCode()
@@ -639,6 +611,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
        code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
+        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
        code.raw(f"""
        auto custream = reinterpret_cast<cudaStream_t>(stream_int);
@@ -651,7 +624,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        // out_inds: [MaxSize, {self.ndim + 1}]
        auto timer = tv::CudaContextTimer<>();
        int64_t uniq_size = indice_pairs.size() / 2 + 1;
-        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
+        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= num_out_act, "error");
        TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
        tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
        launcher_num_act_in.blocks.y = kv;

--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
@@ -22,7 +22,9 @@ from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, Thru
 from cumm.gemm import codeops
 from typing import List 
 from cumm.conv.params import ConvProblem
+from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
 import numpy as np 
+from cumm.gemm import (thread_map)

 class IndiceMaxPool(pccm.Class):
    # TODO optimize this function
@@ -171,4 +173,3 @@ class IndiceMaxPool(pccm.Class):
        }});
        """)
        return code
-
--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -14,6 +14,7 @@

 import math
 import time
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import torch
@@ -39,6 +40,10 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
    if dimensions == 2:  # Linear
        fan_in = tensor.size(-2)
        fan_out = tensor.size(-1)
+    else:
+        if FILTER_HWIO:
+            num_input_fmaps = tensor.size(-2)
+            num_output_fmaps = tensor.size(-1)
        else:
            num_input_fmaps = tensor.size(-1)
            num_output_fmaps = tensor.size(-2)
@@ -58,22 +63,22 @@ class SparseConvolution(SparseModule):
    ]

    def __init__(self,
-                 ndim,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 bias=True,
-                 subm=False,
-                 output_padding=0,
-                 transposed=False,
-                 inverse=False,
-                 indice_key=None,
-                 fused_bn=False,
-                 algo=ops.ConvAlgo.Native,
+                 ndim: int,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, List[int], Tuple[int, ...]]=3,
+                 stride: Union[int, List[int], Tuple[int, ...]]=1,
+                 padding: Union[int, List[int], Tuple[int, ...]]=0,
+                 dilation: Union[int, List[int], Tuple[int, ...]]=1,
+                 groups: Union[int, List[int], Tuple[int, ...]]=1,
+                 bias: bool=True,
+                 subm: bool=False,
+                 output_padding: Union[int, List[int], Tuple[int, ...]]=0,
+                 transposed: bool=False,
+                 inverse: bool=False,
+                 indice_key: Optional[str]=None,
+                 fused_bn: bool=False,
+                 algo: ops.ConvAlgo=ops.ConvAlgo.Native,
                 name=None):
        super(SparseConvolution, self).__init__(name=name)
        assert groups == 1
@@ -117,8 +122,6 @@ class SparseConvolution(SparseModule):
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
-        # self.workspace_for_splitk = torch.zeros((GLOBAL_MAXIMUM_SPLITK,), dtype=torch.int8)
-        # self.register_buffer("workspace_for_splitk", self.workspace_for_splitk)
        self.reset_parameters()

    def reset_parameters(self):
@@ -234,6 +237,7 @@ class SparseConvolution(SparseModule):
            t = time.time()

        if self.fused_bn:
+            raise NotImplementedError
            assert self.bias is not None
            out_features = ops.fused_indice_conv(features, self.weight,
                                                 self.bias,
@@ -382,6 +386,34 @@ class SparseConv4d(SparseConvolution):
                                           name=name)


+class SparseConvTranspose1d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super(SparseConvTranspose1d, self).__init__(1,
+                                                    in_channels,
+                                                    out_channels,
+                                                    kernel_size,
+                                                    stride,
+                                                    padding,
+                                                    dilation,
+                                                    groups,
+                                                    bias,
+                                                    transposed=True,
+                                                    indice_key=indice_key,
+                                                    algo=algo,
+                                                    name=name)
+
+
 class SparseConvTranspose2d(SparseConvolution):
    def __init__(self,
                 in_channels,
@@ -437,6 +469,34 @@ class SparseConvTranspose3d(SparseConvolution):
                                                    algo=algo,
                                                    name=name)

+class SparseConvTranspose4d(SparseConvolution):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 indice_key=None,
+                 algo=ops.ConvAlgo.Native,
+                 name=None):
+        super(SparseConvTranspose4d, self).__init__(4,
+                                                    in_channels,
+                                                    out_channels,
+                                                    kernel_size,
+                                                    stride,
+                                                    padding,
+                                                    dilation,
+                                                    groups,
+                                                    bias,
+                                                    transposed=True,
+                                                    indice_key=indice_key,
+                                                    algo=algo,
+                                                    name=name)
+
+
 class SparseInverseConv1d(SparseConvolution):
    def __init__(self,
                 in_channels,

--- a/spconv/pytorch/modules.py
+++ b/spconv/pytorch/modules.py
@@ -143,50 +143,3 @@ class SparseSequential(SparseModule):
                    input = module(input)
        return input

-    def fused(self):
-        """don't use this. no effect.
-        """
-        from spconv.pytorch.conv import SparseConvolution
-        mods = [v for k, v in self._modules.items()]
-        fused_mods = []
-        idx = 0
-        while idx < len(mods):
-            if is_sparse_conv(mods[idx]):
-                if idx < len(mods) - 1 and isinstance(mods[idx + 1],
-                                                      nn.BatchNorm1d):
-                    new_module = SparseConvolution(
-                        ndim=mods[idx].ndim,
-                        in_channels=mods[idx].in_channels,
-                        out_channels=mods[idx].out_channels,
-                        kernel_size=mods[idx].kernel_size,
-                        stride=mods[idx].stride,
-                        padding=mods[idx].padding,
-                        dilation=mods[idx].dilation,
-                        groups=mods[idx].groups,
-                        bias=True,
-                        subm=mods[idx].subm,
-                        output_padding=mods[idx].output_padding,
-                        transposed=mods[idx].transposed,
-                        inverse=mods[idx].inverse,
-                        indice_key=mods[idx].indice_key,
-                        fused_bn=True,
-                    )
-                    new_module.load_state_dict(mods[idx].state_dict(), False)
-                    new_module.to(mods[idx].weight.device)
-                    conv = new_module
-                    bn = mods[idx + 1]
-                    conv.bias.data.zero_()
-                    conv.weight.data[:] = conv.weight.data * bn.weight.data / (
-                        torch.sqrt(bn.running_var) + bn.eps)
-                    conv.bias.data[:] = (
-                        conv.bias.data - bn.running_mean) * bn.weight.data / (
-                            torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data
-                    fused_mods.append(conv)
-                    idx += 2
-                else:
-                    fused_mods.append(mods[idx])
-                    idx += 1
-            else:
-                fused_mods.append(mods[idx])
-                idx += 1
-        return SparseSequential(*fused_mods)
--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import functools
 from enum import Enum
 from cumm import tensorview as tv
 from cumm.gemm.algospec.core import ShuffleStrideType
@@ -23,7 +23,7 @@ from spconv.algo import AlgoHint, ConvAlgo
 from typing import List, Union
 from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
 from spconv.core_cc.csrc.sparse.all import SpconvOps
-from spconv.algo import GEMM# , GATHER, SCATTER
+from spconv.algo import GEMM  # , GATHER, SCATTER
 import time
 from spconv.constants import FILTER_HWIO

@@ -58,26 +58,17 @@ def get_indice_pairs(indices: torch.Tensor,
                     batch_size: int,
                     spatial_shape: List[int],
                     algo: ConvAlgo,
-                     ksize: Union[int, List[int]],
-                     stride: Union[int, List[int]],
-                     padding: Union[int, List[int]],
-                     dilation: Union[int, List[int]],
-                     out_padding: Union[int, List[int]],
+                     ksize: List[int],
+                     stride: List[int],
+                     padding: List[int],
+                     dilation: List[int],
+                     out_padding: List[int],
                     subm: bool = False,
                     transpose: bool = False):
-
+    # torch.cuda.synchronize()
+    # t = time.time()
    ndim = indices.shape[1] - 1
-    if not isinstance(ksize, (list, tuple)):
-        ksize = [ksize] * ndim
-    if not isinstance(stride, (list, tuple)):
-        stride = [stride] * ndim
-    if not isinstance(padding, (list, tuple)):
-        padding = [padding] * ndim
-    if not isinstance(dilation, (list, tuple)):
-        dilation = [dilation] * ndim
-    if not isinstance(out_padding, (list, tuple)):
-        out_padding = [out_padding] * ndim
-    kv: int = int(np.prod(ksize))
+    kv: int = functools.reduce(lambda x, y: x * y, ksize, 1)
    if not subm:
        if transpose:
            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
@@ -87,8 +78,9 @@ def get_indice_pairs(indices: torch.Tensor,
                                             padding, dilation)
    else:
        out_shape = spatial_shape
-    assert algo == ConvAlgo.Native and not transpose, "TODO"
+    assert algo == ConvAlgo.Native, "TODO"
    stream = get_current_stream()
+
    pair = torch.full((2, kv, indices.shape[0]),
                      -1,
                      dtype=indices.dtype,
@@ -96,19 +88,20 @@ def get_indice_pairs(indices: torch.Tensor,
    indice_num_per_loc = torch.zeros((kv, ),
                                     dtype=indices.dtype,
                                     device=indices.device)
+
    inds_tv = torch_tensor_to_tv(indices)
    pair_tv = torch_tensor_to_tv(pair)
    indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)
-    # torch.cuda.synchronize()
-    # t = time.time()

    if subm:
        out_inds = indices
+
        hashdata = torch.empty((out_inds.shape[0] * 2, ),
                               dtype=torch.int64,
                               device=indices.device)
        out_inds_tv = torch_tensor_to_tv(out_inds)
        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+
        SpconvOps.generate_subm_conv_inds(inds_tv,
                                          hashdata_tv,
                                          pair_tv,
@@ -120,16 +113,15 @@ def get_indice_pairs(indices: torch.Tensor,
                                          dilation=dilation,
                                          stream_int=stream)
        # torch.cuda.synchronize()
-
-        # print("SUBM INDICE GEN", time.time() - t)
+        # print("SUBM", time.time() - t)

    else:
        indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
                                        dtype=indices.dtype,
                                        device=indices.device)
        indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
-        num_act_out = SpconvOps.generate_conv_inds_stage1(
-            inds_tv,
+
+        SpconvOps.generate_conv_inds_stage1(inds_tv,
                                            pair_tv,
                                            indice_pairs_uniq_tv,
                                            indice_num_per_loc_tv,
@@ -140,7 +132,17 @@ def get_indice_pairs(indices: torch.Tensor,
                                            stride=stride,
                                            padding=padding,
                                            dilation=dilation,
+                                            transposed=transpose,
                                            stream_int=stream)
+        uniq_res = indice_pairs_uniq.unique()
+        num_act_out = uniq_res.shape[0] - 1
+        uniq_res_tv = torch_tensor_to_tv(uniq_res)
+        # num_act_out = SpconvOps.generate_conv_inds_stage1_5(
+        #     indice_pairs_uniq_tv,
+        #     ndim,
+        #     uniq_size=indice_pairs_uniq_tv.size,
+        #     stream_int=stream)
+        # uniq_res_tv = indice_pairs_uniq_tv.slice_first_axis(0, num_act_out)
        out_inds = torch.empty((num_act_out, indices.shape[1]),
                               dtype=indices.dtype,
                               device=indices.device)
@@ -152,7 +154,7 @@ def get_indice_pairs(indices: torch.Tensor,
        SpconvOps.generate_conv_inds_stage2(inds_tv,
                                            hashdata_tv,
                                            pair_tv,
-                                            indice_pairs_uniq_tv,
+                                            uniq_res_tv,
                                            out_inds_tv,
                                            num_out_act=num_act_out,
                                            batch_size=batch_size,
@@ -162,11 +164,10 @@ def get_indice_pairs(indices: torch.Tensor,
                                            stride=stride,
                                            padding=padding,
                                            dilation=dilation,
+                                            transposed=transpose,
                                            stream_int=stream)
        # torch.cuda.synchronize()
-
-        # print("INDICE GEN", time.time() - t)
-
+        # print("REGU", time.time() - t)
    return out_inds, pair, indice_num_per_loc


@@ -228,8 +229,6 @@ def indice_conv(features: torch.Tensor,
        c_inds_shape=[nhot_profile],
        hint=AlgoHint.Fowrard.value)

-    gather_data_tv = tv.Tensor()
-    scatter_data_tv = tv.Tensor()

    maxnhot = max(indice_pair_num_cpu)
    if profile_res is None:
@@ -270,13 +269,10 @@ def indice_conv(features: torch.Tensor,
            continue
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
-        # inp_indices = torch_tensor_to_tv(inp_indices_th)
-        # out_indices = torch_tensor_to_tv(out_indices_th)
        b = filters_tv[i]
        # inp @ filter.T, NC @ KC
        beta = 1.0 if inited else 0.0
-        algo_desp = GEMM.run_profile(
-            profile_res,
+        algo_desp = GEMM.run_profile(profile_res,
                                     a,
                                     b,
                                     c,
@@ -295,11 +291,11 @@ def indice_conv(features: torch.Tensor,
        # gather_times += gather_time
        inited = True
    # torch.cuda.synchronize()
-    # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
-    # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times)
+    # # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
+    # # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t)

-    # print(indice_pair_num_cpu)
-    # print(time.time() - t)
+    # # print(indice_pair_num_cpu)
+    # print("G", time.time() - t)
    return out_features


@@ -316,8 +312,6 @@ def indice_conv_backward(features: torch.Tensor,
                         inverse: bool = False,
                         subm: bool = False,
                         algo: ConvAlgo = ConvAlgo.Native):
-    # workspace = torch.empty((10000), dtype=torch.uint8, device=features.device)
-    # workspace_tv = torch_tensor_to_tv(workspace)
    # torch.cuda.synchronize()
    # t = time.time()

@@ -400,7 +394,6 @@ def indice_conv_backward(features: torch.Tensor,
            c_inds=out_indices,
            alpha=1.0,
            beta=0.0,
-            # scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
            hint=AlgoHint.BackwardInput.value,
            stream=stream)
    if not FILTER_HWIO:
@@ -445,7 +438,6 @@ def indice_conv_backward(features: torch.Tensor,
            b_inds=b_inds_wgrad,
            alpha=1.0,
            beta=0.0,
-            # scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
            hint=AlgoHint.BackwardWeight.value,
            stream=stream)
        # print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time)
@@ -457,8 +449,9 @@ def indice_conv_backward(features: torch.Tensor,
    else:
        b_shape = [maxnhot, out_bp_tv.dim(1)]
        a_shape = [maxnhot, features_tv.dim(1)]
-    m, n, k = GEMM.extract_mnk(
-        a_shape, b_shape, profile_res_wgrad.algo_desp.trans_a,
+    m, n, k = GEMM.extract_mnk(a_shape,
+                               b_shape,
+                               profile_res_wgrad.algo_desp.trans_a,
                               profile_res_wgrad.algo_desp.trans_b,
                               profile_res_wgrad.algo_desp.trans_c,
                               arch=arch,
@@ -466,12 +459,15 @@ def indice_conv_backward(features: torch.Tensor,
                               a_inds_shape=[maxnhot],
                               b_inds_shape=[maxnhot],
                               hint=AlgoHint.BackwardWeight.value)
-    workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(m, n, k, profile_res_wgrad.splitk)
+    workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(
+        m, n, k, profile_res_wgrad.splitk)
    workspace = torch.Tensor()

    workspace_tv = tv.Tensor()
    if workspace_size > 0:
-        workspace = torch.empty((workspace_size,), dtype=torch.int8, device=features.device)
+        workspace = torch.empty((workspace_size, ),
+                                dtype=torch.int8,
+                                device=features.device)
        workspace_tv = torch_tensor_to_tv(workspace)
    # print(workspace_size, m, n, k, profile_res_wgrad.splitk)
    # torch.cuda.synchronize()
@@ -538,11 +534,13 @@ def indice_conv_backward(features: torch.Tensor,
    # dw_time = time.time() - t
    # # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape)
    # # print(dw_time + di_time)
-    # print(time.time() - t)
+    # print("BWG", time.time() - t)
    return (din, dfilters.reshape(filters_shape))


 def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
+    # torch.cuda.synchronize()
+    # t = time.time()
    out_channel = features.shape[-1]
    out_features = torch.zeros((num_activate_out, out_channel),
                               dtype=features.dtype,
@@ -558,6 +556,9 @@ def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
        SpconvOps.maxpool_forward(out_features_tv, features_tv, out_indices,
                                  inp_indices, stream)
+    # torch.cuda.synchronize()
+    # print("M", time.time() - t)
+
    return out_features



--- a/spconv/pytorch/pool.py
+++ b/spconv/pytorch/pool.py
@@ -142,6 +142,20 @@ class SparseMaxPool(SparseModule):
        return out_tensor


+class SparseMaxPool1d(SparseMaxPool):
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 dilation=1,
+                 name=None):
+        super(SparseMaxPool1d, self).__init__(1,
+                                              kernel_size,
+                                              stride,
+                                              padding,
+                                              dilation,
+                                              name=name)
+
 class SparseMaxPool2d(SparseMaxPool):
    def __init__(self,
                 kernel_size,
@@ -170,3 +184,17 @@ class SparseMaxPool3d(SparseMaxPool):
                                              padding,
                                              dilation,
                                              name=name)
+
+class SparseMaxPool4d(SparseMaxPool):
+    def __init__(self,
+                 kernel_size,
+                 stride=None,
+                 padding=0,
+                 dilation=1,
+                 name=None):
+        super(SparseMaxPool4d, self).__init__(4,
+                                              kernel_size,
+                                              stride,
+                                              padding,
+                                              dilation,
+                                              name=name)
--- a/test/aaa.py
+++ b/test/aaa.py
@@ -13,20 +13,20 @@
 # limitations under the License.

 STR = """
-0.0016176700592041016
-0.002481698989868164
-0.0027854442596435547
-0.0031723976135253906
-0.0017437934875488281
-0.0020503997802734375
-0.001399993896484375
-0.0016183853149414062
-0.0007357597351074219
-0.0008492469787597656
-0.0006558895111083984
-0.0007994174957275391
-0.000335693359375
-0.000347137451171875
+BWG 0.0008761882781982422
+BWG 0.0008311271667480469
+BWG 0.002079486846923828
+BWG 0.002329587936401367
+BWG 0.0025458335876464844
+BWG 0.0026700496673583984
+BWG 0.002583742141723633
+BWG 0.0025262832641601562
+BWG 0.003481149673461914
+BWG 0.003238201141357422
+BWG 0.005095958709716797
+BWG 0.0037899017333984375
+BWG 0.003931283950805664
+BWG 0.003300189971923828
 """
 """
 0.003921985626220703
@@ -46,37 +46,86 @@ STR = """
 0.00030994415283203125
 """

-STR = """
-0.0006084442138671875
-0.0005354881286621094
-0.0012688636779785156
-0.0012619495391845703
-0.002301931381225586
-0.0019693374633789062
-0.0038712024688720703
-0.002872467041015625
-0.005068302154541016
-0.0047588348388671875
-0.007832765579223633
-0.005643367767333984
-0.005807161331176758
-0.004715442657470703"""
+STR1 = """
+SUBM 0.00036716461181640625
+G 0.0010955333709716797
+G 0.0010745525360107422
+REGU 0.0006923675537109375
+M 0.0005242824554443359
+SUBM 0.0003108978271484375
+G 0.0010905265808105469
+G 0.0011067390441894531
+REGU 0.00058746337890625
+M 0.0005304813385009766
+SUBM 0.0002682209014892578
+G 0.0010945796966552734
+G 0.0011165142059326172
+REGU 0.0005419254302978516
+M 0.0005164146423339844
+SUBM 0.00021505355834960938
+G 0.0010805130004882812
+G 0.0010516643524169922
+REGU 0.00052642822265625
+M 0.0004677772521972656
+SUBM 0.0002262592315673828
+G 0.0010986328125
+G 0.0010256767272949219
+REGU 0.0005693435668945312
+M 0.00048661231994628906
+SUBM 0.0002319812774658203
+G 0.0011110305786132812
+G 0.0011196136474609375
+REGU 0.0005295276641845703
+M 0.0005729198455810547
+SUBM 0.00023889541625976562
+G 0.0005326271057128906
+G 0.0005140304565429688
 """
-0.0004992485046386719
-0.0003979206085205078
-0.0013720989227294922
-0.0015933513641357422
-0.0027768611907958984
-0.0024590492248535156
-0.004837512969970703
-0.004601001739501953
-0.009881019592285156
-0.008889913558959961
-0.017162084579467773
-0.009079217910766602
-0.009355545043945312
-0.0068836212158203125
+STR2 = """
+SUBM 0.0003352165222167969
+G 0.001149892807006836
+G 0.0017066001892089844
+REGU 0.0006349086761474609
+M 0.00048804283142089844
+SUBM 0.00029850006103515625
+G 0.001767873764038086
+G 0.0020656585693359375
+REGU 0.0005462169647216797
+M 0.0005753040313720703
+SUBM 0.0002789497375488281
+G 0.0012230873107910156
+G 0.0014438629150390625
+REGU 0.0005102157592773438
+M 0.0005676746368408203
+SUBM 0.00020241737365722656
+G 0.00102996826171875
+G 0.0011174678802490234
+REGU 0.0005424022674560547
+M 0.0005102157592773438
+SUBM 0.0001976490020751953
+G 0.0010385513305664062
+G 0.0010204315185546875
+REGU 0.0005321502685546875
+M 0.00047278404235839844
+SUBM 0.00021529197692871094
+G 0.0010280609130859375
+G 0.0010151863098144531
+REGU 0.0004942417144775391
+M 0.0004811286926269531
+SUBM 0.00020694732666015625
+G 0.0005142688751220703
+G 0.0005171298980712891
 """
+def _handle_lines(s: str):
+    arr = s.split(" ")
+    return (arr[0], float(arr[-1]))
+from cumm.gemm.codeops import group_by
+def print_str(s: str):
+
+    nums = list(map(_handle_lines, s.strip().split("\n")))
+    num_dict = group_by(lambda x: x[0], nums)
+    num_dict_ = {k: sum([vv[1] for vv in v]) for k, v in num_dict.items()}
+    print(num_dict_)

-nums = list(map(float, STR.strip().split("\n")))
-print(sum(nums))
\ No newline at end of file
+print_str(STR1)
+print_str(STR2)
\ No newline at end of file
--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -224,25 +224,26 @@ def main():
    # voxels, coors, spatial_shape = waymo_data()
    # with open("/home/yy/test_spconv.pkl", "wb") as f:
    #     pickle.dump((voxels, coors, spatial_shape), f)
-    with open("/home/yy/test_spconv.pkl", "rb") as f:
+    with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
        (voxels, coors, spatial_shape) = pickle.load(f)
    print(spatial_shape)
    print(voxels.shape)
    # voxels = voxels[:100]
    # coors = coors[:100]
-    voxels_th = torch.from_numpy(voxels).cuda().float()
+    dtype = torch.float32
+
+    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
    coors_th = torch.from_numpy(coors).cuda().int()
    voxels_th.requires_grad = True
-
    algo = spconv.ConvAlgo.Native
-    net = Net(spatial_shape, algo).cuda().eval().float()
+    net = Net(spatial_shape, algo).cuda().eval().to(dtype)
    print(coors_th.shape)
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)
    print(voxels.mean(),  voxels.max(), voxels.min())
    dout = np.random.uniform(-0.2, 0.2,
                                out.features.shape).astype(np.float32)
-    dout_t = torch.from_numpy(dout).cuda()
+    dout_t = torch.from_numpy(dout).cuda().to(dtype)

    print(out.spatial_shape, out.features.mean(),  out.features.max(),  out.features.min())
    times = []

--- a/test/data/test_spconv.pkl
+++ b/test/data/test_spconv.pkl
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -381,17 +381,17 @@ class TestSpConv(TestCase):
            else:
                filters = np.random.uniform(0, 1, size=[k, k, k, OC,
                                                        IC]).astype(np.float32)
-
+            dtype = torch.float16
            indices_t = torch.from_numpy(indices).int().to(device)
-            features_t = torch.from_numpy(features).to(device)
+            features_t = torch.from_numpy(features).to(device).to(dtype)
            features_t.requires_grad = True
-            features_dense_t = torch.from_numpy(features_dense).to(device)
+            features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
            features_dense_t.requires_grad = True
            net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
-                                        d).to(device)
+                                        d).to(device).to(dtype)
            net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
-                                      d).to(device)
-            filters_t = torch.from_numpy(filters).to(device)
+                                      d).to(device).to(dtype)
+            filters_t = torch.from_numpy(filters).to(device).to(dtype)
            if FILTER_HWIO:
                net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
                                                                2).contiguous()
@@ -442,6 +442,11 @@ class TestSpConv(TestCase):
        strides = [2, 3]
        paddings = [0, 1, 2]
        dilations = [1, 2, 3]
+        ksizes = [3]
+
+        strides = [1]
+        paddings = [0]
+        dilations = [1]

        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
@@ -458,8 +463,13 @@ class TestSpConv(TestCase):
            indices = np.ascontiguousarray(
                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
+            if FILTER_HWIO:
                filters = np.random.uniform(0, 1, size=[k, k, k, IC,
                                                        OC]).astype(np.float32)
+            else:
+                filters = np.random.uniform(0, 1, size=[k, k, k, OC,
+                                                        IC]).astype(np.float32)
+
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
@@ -470,11 +480,20 @@ class TestSpConv(TestCase):
            net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                        d).to(device)
            filters_t = torch.from_numpy(filters).to(device)
+            print(net_ref.net[0].weight.shape)
+            if FILTER_HWIO:
+                net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
+                                                                2).contiguous()
+            else:
                net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
                                                                2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs).dense()
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
+
            dout = np.random.uniform(-0.2, 0.2,
                                     out_ref.shape).astype(features.dtype)
            dout_t = torch.from_numpy(dout).to(device)
@@ -490,12 +509,12 @@ class TestSpConv(TestCase):
            for layer, layer_ref in zip(net.net, net_ref.net):
                dw = layer.weight.grad.detach().cpu().numpy()
                dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
+                if FILTER_HWIO:
+                    dw = dw.transpose(3, 4, 0, 1, 2)
+                else:
                    dw = dw.transpose(4, 3, 0, 1, 2)
                self.assertAllClose(dw, dw_ref, atol=1e-4)

-            out_np = out.detach().cpu().numpy()
-            out_ref_np = out_ref.detach().cpu().numpy()
-            self.assertAllClose(out_np, out_ref_np, atol=1e-4)

    def testSpCpConv3d(self):
        np.random.seed(484)

--- a/tools/build-wheels.sh
+++ b/tools/build-wheels.sh
@@ -27,8 +27,8 @@ function repair_wheel {

 export SPCONV_DISABLE_JIT="1"
 export CUMM_CUDA_ARCH_LIST="all"
-# Compile wheels, we only support 3.7-3.10.
-# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
+# Compile wheels, we only support 3.6-3.10.
+"/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
 "/opt/python/cp37-cp37m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
 "/opt/python/cp38-cp38/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
 "/opt/python/cp39-cp39/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp