working on c++ only

899008fa · yan.yan · f78575ea · f78575ea · 899008fa · 899008fa
Commit 899008fa authored Jul 20, 2022 by yan.yan
20 changed files
--- a/docs/DEVELOPMENT.md
+++ b/docs/DEVELOPMENT.md
-<!--
- Copyright 2021 Yan Yan
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-     http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-->
-# How to develop spconv 2.x
-## First step
-spconv 2.x is written in a unique c++ framework ```pccm```. read [pccm guide]() to learn how to use ```pccm```.
-It's recommend to uninstall spconv and cumm installed by pip, then install spconv and cumm both in editable mode (```pip install -e .```)
-## Architecture
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1":
    from spconv.csrc.utils import BoxOps
    from spconv.csrc.hash.core import HashTable
    from cumm.common import CompileInfo
+    from spconv.csrc.sparse.alloc import ExternalAllocator
+    from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
+    from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
    convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1":
            std = "c++14" 
        else:
            std = "c++17"
-    cus = [cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
    if CUMM_CPU_ONLY_BUILD:
-        cus = [SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
+        # CPU-only build: cu / convcu are not added to the build list (see the
+        # extend() below), so the tuners are created without them.
+        gemm_main, conv_main = None, None
+    else:
+        # CUDA build: the tuners wrap the prebuilt kernel collections, the
+        # same way spconv/build.py constructs them.
+        gemm_main, conv_main = cu, convcu
+    gemmtuner = GemmTunerSimple(gemm_main)
+    gemmtuner.namespace = "csrc.sparse.convops.gemmops"
+    convtuner = ConvTunerSimple(conv_main)
+    convtuner.namespace = "csrc.sparse.convops.convops"
+    convops = ConvGemmOps(gemmtuner, convtuner)
+    convops.namespace = "csrc.sparse.convops.spops"
+    cus = [gemmtuner, convtuner,
+        convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(),
+        ExternalAllocator(),
+        ExternalSpconvMatmul()]
+    if not CUMM_CPU_ONLY_BUILD:
+        cus.extend([cu, convcu])
    ext_modules: List[Extension] = [
        PCCMExtension(cus,
                      "spconv/core_cc",
                      Path(__file__).resolve().parent / "spconv",
-                      objects_folder="objects",
                      std=std,
                      disable_pch=True,
                      verbose=True)

--- a/spconv/algo.py
+++ b/spconv/algo.py
@@ -37,7 +37,7 @@ from cumm import dtypes
 from spconv.constants import (NDIM_DONT_CARE, SPCONV_BWD_SPLITK,
                              SPCONV_NVRTC_MODE, SPCONV_DEBUG_NVRTC_KERNELS)
-from spconv.core import ALL_IMPGEMM_PARAMS, AlgoHint, ConvAlgo
+from spconv.core import ALL_IMPGEMM_PARAMS, AlgoHint, ConvAlgo, ALL_NATIVE_PARAMS
 from spconv.core_cc.cumm.conv.main import ConvMainUnitTest
 from spconv.core_cc.cumm.gemm.main import GemmMainUnitTest
 from spconv.cppconstants import COMPILED_CUDA_ARCHS
@@ -49,14 +49,17 @@ from spconv import algocore
 from cumm.conv.main import gen_gemm_kernels as gen_conv_kernels
 from cumm.gemm.main import gen_gemm_kernels
+from spconv.core_cc.csrc.sparse.convops import GemmTuneResult, ConvTuneResult
+from spconv.core_cc.csrc.sparse.convops.gemmops import GemmTunerSimple as GemmTunerSimpleBase
+from spconv.core_cc.csrc.sparse.convops.convops import ConvTunerSimple as ConvTunerSimpleBase
 ALL_ALGO_DESPS = GemmMainUnitTest.get_all_algo_desp()
 ALL_CONV_ALGO_DESPS = ConvMainUnitTest.get_all_conv_algo_desp()
-_GEMM_STATIC_KEY = Tuple[bool, bool, bool, int, int, int, str, str]
+_GEMM_STATIC_KEY = Tuple[bool, bool, bool, int, int, int, int, str]
 class SimpleGemmAlgoMeta:
    def __init__(self, tile_ms: List[int], tile_ns: List[int],
                 tile_ks: List[int],
                 tile_shape_to_algos: Dict[int, List[int]]) -> None:
@@ -67,19 +70,29 @@ class SimpleGemmAlgoMeta:
 class BestAlgoByProfile:
-    def __init__(self, algo_desp: GemmAlgoDesp, arch: Tuple[int, int], splitk: int = 1) -> None:
+    def __init__(self,
+                 algo_desp: GemmAlgoDesp,
+                 arch: Tuple[int, int],
+                 splitk: int = 1) -> None:
        self.algo_desp = algo_desp
        self.splitk = splitk
        self.arch = arch
 class BestConvAlgoByProfile:
-    def __init__(self, algo_desp: ConvAlgoDesp, arch: Tuple[int, int], splitk: int = 1) -> None:
+    def __init__(self,
+                 algo_desp: ConvAlgoDesp,
+                 arch: Tuple[int, int],
+                 splitk: int = 1) -> None:
        self.algo_desp = algo_desp
        self.splitk = splitk
        self.arch = arch
-def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel], kernel_name: str):
+def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
+                      kernel_name: str):
    nvrtc_mode = SPCONV_NVRTC_MODE
    nvrtc_params = tv.gemm.NVRTCParams()
    nvrtc_params.cumodule = mod.get_cpp_object()
@@ -89,8 +102,7 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
    ns = ker.namespace
    if nvrtc_mode == NVRTCMode.DynamicParallism:
-        nvrtc_params.kernel_name = mod.get_lowered_name(
+        nvrtc_params.kernel_name = mod.get_lowered_name(f"{ns}::nvrtc_kernel")
-            f"{ns}::nvrtc_kernel")
    elif nvrtc_mode == NVRTCMode.KernelAndCPU:
        nvrtc_params.kernel_name = mod.get_lowered_name(f"{ns}::{kernel_name}")
@@ -101,8 +113,10 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
        nvrtc_params.param_storage = tv.empty([nvrtc_params.param_size],
                                              tv.uint8, 0)
-        nvrtc_params.param_storage_cpu = tv.empty(
+        nvrtc_params.param_storage_cpu = tv.empty([nvrtc_params.param_size],
-            [nvrtc_params.param_size], tv.uint8, -1, pinned=True)
+                                                  tv.uint8,
+                                                  -1,
+                                                  pinned=True)
    elif nvrtc_mode == NVRTCMode.Direct:
        nvrtc_params.kernel_name = mod.get_lowered_name(f"{ns}::{kernel_name}")
@@ -120,9 +134,84 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
        raise NotImplementedError
    return nvrtc_params
+class GemmTunerSimple(GemmTunerSimpleBase):
+    """GemmTunerSimpleBase subclass that adds NVRTC compilation of GEMM kernels.
+    Results of ``cached_get_nvrtc_params`` are memoized per
+    ``(str(desp), arch, stream_int)``.
+    """
+    def __init__(self, desps: List[GemmAlgoDesp]) -> None:
+        super().__init__(desps)
+        # (str(desp), arch, stream_int) -> NVRTCParams built for that kernel
+        self._nvrtc_caches: Dict[Tuple[str, Tuple[int, int], int], NVRTCParams] = {}
+    def _compile_nvrtc_module(self, desp: GemmAlgoDesp):
+        """Generate the GEMM kernel described by ``desp`` and load it via NVRTC.
+        Returns a ``(module, kernel)`` pair.
+        """
+        ker = gen_gemm_kernels(algocore.get_gemm_param_from_desp(desp),
+                               SPCONV_NVRTC_MODE)
+        ker.namespace = "spconv"
+        # constant-memory mode needs the constant parameter symbol registered
+        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
+            extra_names = [
+                f"&{ker.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
+            ]
+        else:
+            extra_names = []
+        devrt_path = ""
+        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
+            found = get_cudadevrt_path()
+            assert found is not None, "DynamicParallism must have cudadevrt"
+            devrt_path = str(found)
+        module = CummNVRTCModule([ker],
+                                 cudadevrt_path=devrt_path,
+                                 custom_names=extra_names)
+        module.load()
+        return module, ker
+    def cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
+        """Return NVRTCParams for ``desp``, compiling the kernel on a cache miss."""
+        cache_key = (str(desp), arch, stream_int)
+        if cache_key not in self._nvrtc_caches:
+            module, ker = self._compile_nvrtc_module(desp)
+            self._nvrtc_caches[cache_key] = _get_nvrtc_params(
+                module, ker, "gemm_kernel")
+        return self._nvrtc_caches[cache_key]
+class ConvTunerSimple(ConvTunerSimpleBase):
+    """ConvTunerSimpleBase subclass that adds NVRTC compilation of conv kernels.
+    Results of ``cached_get_nvrtc_params`` are memoized per
+    ``(str(desp), arch, stream_int)``.
+    """
+    def __init__(self, desps: List[ConvAlgoDesp]) -> None:
+        super().__init__(desps)
+        # (str(desp), arch, stream_int) -> NVRTCParams built for that kernel
+        self._nvrtc_caches: Dict[Tuple[str, Tuple[int, int], int], NVRTCParams] = {}
+    def _compile_nvrtc_module(self, desp: ConvAlgoDesp):
+        """Generate the conv kernel described by ``desp`` and load it via NVRTC.
+        Returns a ``(module, kernel)`` pair.
+        """
+        params = algocore.get_conv_param_from_desp(desp)
+        kernel = gen_conv_kernels(params, SPCONV_NVRTC_MODE)
+        kernel.namespace = "spconv"
+        custom_names = []
+        # constant-memory mode needs the constant parameter symbol registered
+        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
+            custom_names = [
+                f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
+            ]
+        cudadevrt = ""
+        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
+            cudadevrt_p = get_cudadevrt_path()
+            assert cudadevrt_p is not None, "DynamicParallism must have cudadevrt"
+            cudadevrt = str(cudadevrt_p)
+        mod = CummNVRTCModule([kernel],
+                              cudadevrt_path=cudadevrt,
+                              verbose=False,
+                              custom_names=custom_names)
+        mod.load()
+        return mod, kernel
+    def cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
+        """Return NVRTCParams for ``desp``, compiling the kernel on a cache miss."""
+        key = (str(desp), arch, stream_int)
+        if key in self._nvrtc_caches:
+            return self._nvrtc_caches[key]
+        # Announce the compile before it starts (same order as
+        # SimpleConv._cached_get_nvrtc_params) so the message explains the
+        # wait instead of appearing after the work is already done.
+        print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
+        mod, ker = self._compile_nvrtc_module(desp)
+        nvrtc_params = _get_nvrtc_params(mod, ker, "conv_kernel")
+        self._nvrtc_caches[key] = nvrtc_params
+        return nvrtc_params
 class SimpleGemm:
    def __init__(self, prebuilt_desps: List[GemmAlgoDesp]) -> None:
-        all_desps = [algocore.get_conv_algo_desp_from_param(p) for p in ALL_IMPGEMM_PARAMS]
+        all_desps = [
+            algocore.get_gemm_algo_desp_from_param(p)
+            for p in ALL_NATIVE_PARAMS
+        ]
        self.prebuilt_desps = prebuilt_desps
        self.prebuilt_desp_names = {str(d) for d in prebuilt_desps}
        if SPCONV_DEBUG_NVRTC_KERNELS:
@@ -178,7 +267,9 @@ class SimpleGemm:
        kernel.namespace = "spconv"
        custom_names = []
        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
-            custom_names = [f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"]
+            custom_names = [
+                f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
+            ]
        cudadevrt = ""
        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
            cudadevrt_p = get_cudadevrt_path()
@@ -186,12 +277,12 @@ class SimpleGemm:
            cudadevrt = str(cudadevrt_p)
        mod = CummNVRTCModule([kernel],
                              cudadevrt_path=cudadevrt,
-                            verbose=False,
                              custom_names=custom_names)
        mod.load()
        return mod, kernel
-    def _cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int]):
+    def _cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int,
+                                                                       int]):
        key = (str(desp), arch)
        if key in self._nvrtc_caches:
            return self._nvrtc_caches[key]
@@ -218,12 +309,15 @@ class SimpleGemm:
            trans_c = False
        avail_algos = get_available_algo_str_from_arch(arch)
        finally_algos: List[GemmAlgoDesp] = []
+        # print(self.static_key_to_desps)
        for algo in avail_algos:
            static_key = (trans_a, trans_b, trans_c, a.dtype, b.dtype, c.dtype,
                          shuffle_type.value, algo)
+            # print(static_key)
            desps = self.static_key_to_desps.get(static_key, None)
            if desps is None or len(desps) == 0:
                continue
+            # print(desps)
            for desp in desps:
                # skip volta tensor op since it is very slow in architectures except volta.
                if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
@@ -430,6 +524,7 @@ class SimpleGemm:
        best_scatter_params = (-1, -1, -1, -1)
        all_profile_res: List[BestAlgoByProfile] = []
+        # print(avail)
        for desp in avail:
            c_.zero_whole_storage_()
            split_k_slices = 1
@@ -466,7 +561,8 @@ class SimpleGemm:
                times.append(np.mean(this_times[1:]))
                spk_speeds.append(times[-1])
-                all_profile_res.append(BestAlgoByProfile(desp, arch, splitk=spk))
+                all_profile_res.append(
+                    BestAlgoByProfile(desp, arch, splitk=spk))
        min_time = 1000
        min_idx = -1
@@ -490,8 +586,7 @@ class SimpleGemm:
        return res, min_time
-    def run_with_tuned_result(
+    def run_with_tuned_result(self,
-        self,
                              profile_res: BestAlgoByProfile,
                              a: tv.Tensor,
                              b: tv.Tensor,
@@ -501,7 +596,7 @@ class SimpleGemm:
                              trans_c: bool,
                              arch: Tuple[int, int],
                              stream: int,
-        shuffle_type: ShuffleStrideType = ShuffleStrideType.NoShuffle,
+                              shuffle_type: ShuffleStrideType,
                              a_inds: tv.Tensor = tv.Tensor(),
                              b_inds: tv.Tensor = tv.Tensor(),
                              c_inds: tv.Tensor = tv.Tensor(),
@@ -510,7 +605,8 @@ class SimpleGemm:
                              beta: float = 0.0,
                              gather_data: tv.Tensor = tv.Tensor(),
                              workspace: tv.Tensor = tv.Tensor(),
-        timer: CUDAKernelTimer = CUDAKernelTimer(False)):
+                              timer: CUDAKernelTimer = CUDAKernelTimer(False),
+                              force_nvrtc: bool = False):
        m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape, trans_a,
                                               trans_b, trans_c,
                                               shuffle_type.value,
@@ -526,8 +622,10 @@ class SimpleGemm:
        if profile_res.splitk > 1:
            split_k_slices = profile_res.splitk
        params = GemmParams()
-        if algo_desp.is_nvrtc and str(algo_desp) not in self.prebuilt_desp_names:
+        is_not_static = str(algo_desp) not in self.prebuilt_desp_names
-            params.nvrtc_params = self._cached_get_nvrtc_params(algo_desp, profile_res.arch)
+        if algo_desp.is_nvrtc and (is_not_static or force_nvrtc):
+            params.nvrtc_params = self._cached_get_nvrtc_params(
+                algo_desp, profile_res.arch)
        params.a = a
        params.b = b
@@ -569,8 +667,12 @@ _CONV_STATIC_KEY = Tuple[int, int, int, int, int, int, int, int, int, str, int]
 class SimpleConv:
    def __init__(self, prebuilt_desps: List[ConvAlgoDesp]) -> None:
-        all_desps = [algocore.get_conv_algo_desp_from_param(p) for p in ALL_IMPGEMM_PARAMS]
+        all_desps = [
+            algocore.get_conv_algo_desp_from_param(p)
+            for p in ALL_IMPGEMM_PARAMS
+        ]
        self.prebuilt_desps = prebuilt_desps
        self.prebuilt_desp_names = {str(d) for d in prebuilt_desps}
        self.prebuilt_desp_names.clear()
@@ -650,6 +752,7 @@ class SimpleConv:
                    use_f32_as_accum = weight.dim(0) * kv > 128 * 27
            else:
                use_f32_as_accum = fp32_accum
+        use_f32_as_accum = False
        for algo in avail_algos:
            static_key = (layout_i.layout_type.value,
                          layout_w.layout_type.value,
@@ -664,7 +767,6 @@ class SimpleConv:
                if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
                    continue
                if arch >= (7, 0) and is_fp16:
-                    # skip simt fp16 kernels if we have tensor core
                    if desp.algo == GemmAlgo.Simt:
                        continue
                    if use_f32_as_accum:
@@ -675,6 +777,7 @@ class SimpleConv:
                ldw = weight.dim(-1)
                ldo = out.dim(-1)
                mask_width_valid = True
                if desp.op_type == ConvOpType.kBackwardWeight.value:
                    assert mask_width > 0
                    mask_width_valid = mask_width % desp.tile_shape[2] == 0
@@ -722,7 +825,9 @@ class SimpleConv:
        kernel.namespace = "spconv"
        custom_names = []
        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
-            custom_names = [f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"]
+            custom_names = [
+                f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
+            ]
        cudadevrt = ""
        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
            cudadevrt_p = get_cudadevrt_path()
@@ -735,10 +840,12 @@ class SimpleConv:
        mod.load()
        return mod, kernel
-    def _cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int]):
+    def _cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int,
+                                                                       int]):
        key = (str(desp), arch)
        if key in self._nvrtc_caches:
            return self._nvrtc_caches[key]
+        print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
        mod, ker = self._compile_nvrtc_module(desp)
        nvrtc_params = _get_nvrtc_params(mod, ker, "conv_kernel")
        self._nvrtc_caches[key] = nvrtc_params
@@ -795,8 +902,8 @@ class SimpleConv:
            params.indices = indices
            params.mask = mask
            params.mask_output = mask_output
-            if op_type == ConvOpType.kBackwardWeight:
+            # if op_type == ConvOpType.kBackwardWeight:
-                assert not mask_output.empty()
+            #     assert not mask_output.empty()
            if op_type == ConvOpType.kBackwardInput:
                params.reverse_mask = reverse_mask
            params.mask_filter = mask_filter
@@ -808,20 +915,20 @@ class SimpleConv:
            spk_speeds = []
            for spk in splitk_tests:
                this_times = []
-                for j in range(3):
+                for j in range(4):
-                    GemmMainUnitTest.stream_synchronize(stream)
-                    t = time.time()
                    params.split_k_slices = spk
-                    if desp.is_nvrtc and str(desp) not in self.prebuilt_desp_names:
+                    with tv.measure_duration(stream=stream) as measure:
+                        if desp.is_nvrtc and str(
+                                desp) not in self.prebuilt_desp_names:
                            tv.gemm.run_nvrtc_conv_kernel(params)
                        else:
                            ConvMainUnitTest.implicit_gemm2(params)
-                    GemmMainUnitTest.stream_synchronize(stream)
+                    this_times.append(measure.duration)
-                    this_times.append(time.time() - t)
                times.append(np.mean(this_times[1:]))
                spk_speeds.append(times[-1])
-                all_profile_res.append(BestConvAlgoByProfile(desp, arch, splitk=spk))
+                all_profile_res.append(
+                    BestConvAlgoByProfile(desp, arch, splitk=spk))
        if not all_profile_res:
            raise ValueError("can't find suitable algorithm for", op_type)
        min_time = 1000
@@ -865,7 +972,8 @@ class SimpleConv:
                              stream: int = 0,
                              workspace: tv.Tensor = tv.Tensor(),
                              verbose: bool = False,
-                              timer: CUDAKernelTimer = CUDAKernelTimer(False)):
+                              timer: CUDAKernelTimer = CUDAKernelTimer(False),
+                              force_nvrtc: bool = False):
        channel_k = output.dim(1)
        channel_c = inp.dim(1)
        # GemmMainUnitTest.stream_synchronize(stream)
@@ -879,13 +987,17 @@ class SimpleConv:
        else:
            op_type_value = op_type.value
        params = ConvParams(NDIM_DONT_CARE, ConvOpTypeCpp(op_type_value))
-        if algo_desp.is_nvrtc and str(algo_desp) not in self.prebuilt_desp_names:
+        is_not_static = str(
-            params.nvrtc_params = self._cached_get_nvrtc_params(algo_desp, profile_res.arch)
+                algo_desp) not in self.prebuilt_desp_names
+        if algo_desp.is_nvrtc and (is_not_static or force_nvrtc):
+            params.nvrtc_params = self._cached_get_nvrtc_params(
+                algo_desp, profile_res.arch)
        params.conv_algo_desp = profile_res.algo_desp
        params.input = inp
        params.verbose = verbose
        params.weight = weight.view([channel_k, -1, channel_c])
        params.output = output
        params.split_k_slices = split_k_slices
        params.alpha = alpha
        params.beta = beta
@@ -893,6 +1005,7 @@ class SimpleConv:
        params.mask_argsort = mask_argsort
        params.indices = indices
        params.mask = mask
        params.mask_filter = mask_filter
        params.mask_width = mask_width
        params.mask_filter = mask_filter
@@ -919,6 +1032,13 @@ class SimpleConv:
 GEMM = SimpleGemm(ALL_ALGO_DESPS)
 CONV = SimpleConv(ALL_CONV_ALGO_DESPS)
+# Module-level tuner instances of the GemmTunerSimple / ConvTunerSimple
+# subclasses defined in this module, built from descriptions derived from
+# ALL_NATIVE_PARAMS and ALL_IMPGEMM_PARAMS respectively.
+GEMM_CPP = GemmTunerSimple([
+            algocore.get_gemm_algo_desp_from_param(p)
+            for p in ALL_NATIVE_PARAMS])
+CONV_CPP = ConvTunerSimple([
+            algocore.get_conv_algo_desp_from_param(p)
+            for p in ALL_IMPGEMM_PARAMS])
 if __name__ == "__main__":
    print(len(ALL_CONV_ALGO_DESPS))
    print(ALL_CONV_ALGO_DESPS[0])
--- a/spconv/algocore.py
+++ b/spconv/algocore.py
@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp
 from cumm.tensorview.gemm import ShuffleStrideType as ShuffleStrideTypeCpp
 from cumm.tensorview.gemm import ConvParams, GemmAlgoDesp, GemmParams
-from cumm.gemm.main import GemmAlgoParams
+from cumm.gemm.main import GemmAlgoParams, gen_gemm_kernels
-from cumm.conv.main import ConvAlgoParams, ConvIterAlgo
+from cumm.conv.main import ConvAlgoParams, ConvIterAlgo, gen_gemm_kernels as gen_conv_kernels
 from cumm import dtypes
 from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
                             ConvLayoutType, ConvMode, ConvOpType)
@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
    desp.access_per_vector = p.access_per_vector
    desp.is_nvrtc = p.is_nvrtc
 def get_gemm_algo_desp_from_param(p: GemmAlgoParams):
+    """Build a ``GemmAlgoDesp`` from the GEMM algorithm parameters ``p``.
+    Shared properties are copied by ``_assign_gemm_desp_props``; the
+    element-per-access values are read from a kernel generated for ``p``.
+    """
    desp = GemmAlgoDesp()
    _assign_gemm_desp_props(desp, p)
+    # here we must generate kernel for element-per-access data
+    # (read from the kernel's A/B input iterators and its output iterator)
+    ker = gen_gemm_kernels(p)
+    desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
+    desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
+    desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
    return desp
@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
    desp.interleave_o = p.layout_desp_output.interleave
    desp.mask_sparse = p.mask_sparse
    desp.increment_k_first = p.increment_k_first
+    ker = gen_conv_kernels(p)
+    desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
+    desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
+    desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
    return desp
@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
    p.is_nvrtc = desp.is_nvrtc
 def get_gemm_param_from_desp(desp: GemmAlgoDesp):
    p = GemmAlgoParams((0, 0, 0), (0, 0, 0), 0, "s8,s8,s8,s8,s8", False, False,
                          False, GemmAlgo.Simt)

--- a/spconv/benchmark/me.py
+++ b/spconv/benchmark/me.py
-"""Benchmark MinkowskiEngine
-"""
-from spconv.benchmark.core import get_voxel_data
-import time
-from pathlib import Path
-import numpy as np
-import torch
-from torch import nn
-from spconv.core import ConvAlgo
-from cumm import dtypes
-from spconv.test_utils import params_grid
-_DTYPE_TO_TORCH_DTYPE = {
-    dtypes.float32: torch.float32,
-    dtypes.float16: torch.float16,
-}
-def bench_me_basic(dtype_str: str):
-    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
-    if dtype not in _DTYPE_TO_TORCH_DTYPE:
-        raise NotImplementedError("only support bench f32 and f16 for now")
-    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
--- a/spconv/benchmark/thsp.py
+++ b/spconv/benchmark/thsp.py
-"""Benchmark torchsparse
-"""
-from spconv.benchmark.core import get_voxel_data
-import time
-from pathlib import Path
-import numpy as np
-import torch
-from torch import nn
-from spconv.core import ConvAlgo
-from cumm import dtypes
-from spconv.test_utils import params_grid
-_DTYPE_TO_TORCH_DTYPE = {
-    dtypes.float32: torch.float32,
-    dtypes.float16: torch.float16,
-}
-def bench_torchsparse_basic(dtype_str: str):
-    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
-    if dtype not in _DTYPE_TO_TORCH_DTYPE:
-        raise NotImplementedError("only support bench f32 and f16 for now")
-    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
--- a/spconv/build.py
+++ b/spconv/build.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from pathlib import Path
+from typing import List
 import pccm
 from pccm.utils import project_is_editable, project_is_installed
@@ -32,6 +33,10 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
    from spconv.csrc.sparse.alloc import ExternalAllocator
    from spconv.csrc.utils import BoxOps
    from spconv.csrc.hash.core import HashTable
+    from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
+    from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
+    from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul
    all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
    all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
    cu = GemmMainUnitTest(all_shuffle)
@@ -41,8 +46,35 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
    all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
    convcu = ConvMainUnitTest(all_imp)
    convcu.namespace = "cumm.conv.main"
-    pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo(), ExternalAllocator()],
+    gemmtuner = GemmTunerSimple(cu)
+    gemmtuner.namespace = "csrc.sparse.convops.gemmops"
+    convtuner = ConvTunerSimple(convcu)
+    convtuner.namespace = "csrc.sparse.convops.convops"
+    convops = ConvGemmOps(gemmtuner, convtuner)
+    convops.namespace = "csrc.sparse.convops.spops"
+    cus = [
+        cu, convcu, gemmtuner, convtuner,
+        convops,
+        SpconvOps(),
+        BoxOps(),
+        HashTable(),
+        CompileInfo(),
+        ExternalAllocator(),
+        ExternalSpconvMatmul(),
+        SimpleExternalSpconvMatmul(),
+    ]
+    pccm.builder.build_pybind(cus,
                              PACKAGE_ROOT / "core_cc",
                              namespace_root=PACKAGE_ROOT,
-                              load_library=False)
+                              load_library=False,
+                              verbose=True)
+    # cus_dev: List[pccm.Class] = [
+    # ]
+    # pccm.builder.build_pybind(cus_dev,
+    #                           PACKAGE_ROOT / "core_cc_dev",
+    #                           namespace_root=PACKAGE_ROOT,
+    #                           load_library=False,
+    #                           verbose=True)
--- a/spconv/constants.py
+++ b/spconv/constants.py
@@ -30,6 +30,7 @@ if _filter_hwio_env is not None:
    raise NotImplementedError("SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead.")
 DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1"
 NDIM_DONT_CARE = 3
 FILTER_HWIO = False
@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32,
 SPCONV_NVRTC_MODE = NVRTCMode.ConstantMemory
 SPCONV_DEBUG_NVRTC_KERNELS = False
+SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME)
-class SpconvAllocatorKeys:
+class AllocKeys:
    Pair = "Pair"
    IndiceNumPerLoc = "IndiceNumPerLoc"
    PairMask = "PairMask"
@@ -72,5 +75,31 @@ class SpconvAllocatorKeys:
    # MaskArgSortFwd = "MaskArgSortFwd"
    MaskArgSortBwd = "MaskArgSortBwd"
+    MaskOutputFwd = "MaskOutputFwd"
    OutFeatures = "OutFeatures"
+    Features = "Features"
+    Filters = "Filters"
+    OutBp = "OutBp"
+    DIn = "DIn"
+    DFilters = "DFilters"
+    InpBuffer = "InpBuffer"
+    OutBuffer = "OutBuffer"
+    IndicePairsUniq = "IndicePairsUniq"
+    IndicePairsUniqBackup = "IndicePairsUniqBackup"
+    HashKOrKV = "HashKOrKV"
+    HashV = "HashV"
+    ThrustTemp = "ThrustTemp"
 SPCONV_DEBUG_WEIGHT = False
+SPCONV_CPP_INDICE_PAIRS = True 
+SPCONV_CPP_INDICE_PAIRS_IGEMM = True 
+SPCONV_CPP_GEMM = True
\ No newline at end of file
--- a/spconv/core.py
+++ b/spconv/core.py
@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo
 from cumm.gemm import kernel
 from typing import List
 from cumm.gemm.algospec.core import TensorOp
-from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvIterAlgo, GemmAlgo
+from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvFwd, ConvIterAlgo, GemmAlgo
 from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
                             ConvLayoutType, ConvMode, ConvOpType)
+from spconv.algocore import get_gemm_algo_desp_from_param
 from spconv.constants import NDIM_DONT_CARE
@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [
                     increment_k_first=True,
                     access_per_vector=1),
 ]
-IMPLGEMM_SIMT_PARAMS = [
-    *gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8),
-                     NDIM_DONT_CARE,
-                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
-                     NHWC,
-                     NHWC,
-                     NHWC,
-                     GemmAlgo.Simt,
-                     None,
-                     mask_sparse=True,
-                     increment_k_first=True,
-                     access_per_vector=1),
-    *gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8),
-                     NDIM_DONT_CARE,
-                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
-                     NHWC,
-                     NHWC,
-                     NHWC,
-                     GemmAlgo.Simt,
-                     None,
-                     mask_sparse=True,
-                     increment_k_first=True,
-                     access_per_vector=1),
-]
 IMPLGEMM_VOLTA_PARAMS = [
@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [
    #     NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),
    # gen_conv_params(ConvFwdAndBwdInput, )
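+    # int8 kernels. NOTE: originally marked "all int8 kernels use nvrtc", but the active entries below pass is_nvrtc=False; only the commented-out variants set is_nvrtc=True.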
+    *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (32, 64, 64), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 128, 32), (32, 64, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 64, 32), (32, 64, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 64, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 32, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (128, 128, 64), (64, 64, 64),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 128, 64), (32, 64, 64),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    # *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
+    #                  NDIM_DONT_CARE,
+    #                  ConvIterAlgo.Optimized,
+    #                  2, ["s8,s8,s8,s32,s32"],
+    #                  NHWC,
+    #                  NHWC,
+    #                  NHWC,
+    #                  GemmAlgo.Turing,
+    #                  TensorOp((8, 8, 16)),
+    #                  mask_sparse=True,
+    #                  increment_k_first=True,
+    #                  access_per_vector=0,
+    #                  is_nvrtc=True),
+    # *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
+    #                  NDIM_DONT_CARE,
+    #                  ConvIterAlgo.Optimized,
+    #                  2, ["s8,s8,s8,s32,s32"],
+    #                  NHWC,
+    #                  NHWC,
+    #                  NHWC,
+    #                  GemmAlgo.Turing,
+    #                  TensorOp((8, 8, 16)),
+    #                  mask_sparse=True,
+    #                  increment_k_first=True,
+    #                  access_per_vector=0,
+    #                  is_nvrtc=True),
+    # *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
+    #                  NDIM_DONT_CARE,
+    #                  ConvIterAlgo.Optimized,
+    #                  2, ["s8,s8,s8,s32,s32"],
+    #                  NHWC,
+    #                  NHWC,
+    #                  NHWC,
+    #                  GemmAlgo.Turing,
+    #                  TensorOp((8, 8, 16)),
+    #                  mask_sparse=True,
+    #                  increment_k_first=True,
+    #                  access_per_vector=0,
+    #                  is_nvrtc=True),
 ]
 ALL_NATIVE_PARAMS = SHUFFLE_SIMT_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_VOLTA_PARAMS

--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
@@ -48,7 +48,7 @@ class SpconvOps:
        """
        ...
    @staticmethod
-    def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int: 
+    def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0, use_bound_algo: bool = False) -> int: 
        """
        Args:
            indices: 
@@ -58,6 +58,7 @@ class SpconvOps:
            indice_pairs_uniq: 
            indice_pairs_uniq_before_sort: 
            out_inds: 
+            indice_num_per_loc: 
            num_out_act: 
            batch_size: 
            output_dims: 
@@ -68,6 +69,7 @@ class SpconvOps:
            dilation: 
            transposed: 
            stream_int: 
+            use_bound_algo: 
        """
        ...
    @staticmethod
@@ -191,6 +193,31 @@ class SpconvOps:
        """
        ...
    @staticmethod
+    def indice_maxpool(out_features: Tensor, features: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, stream: int = 0) -> None: 
+        """
+        Args:
+            out_features: 
+            features: 
+            indice_pairs: 
+            indice_pair_num: 
+            num_activate_out: 
+            stream: 
+        """
+        ...
+    @staticmethod
+    def indice_maxpool_backward(din: Tensor, features: Tensor, out_features: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, stream: int = 0) -> None: 
+        """
+        Args:
+            din: 
+            features: 
+            out_features: 
+            out_bp: 
+            indice_pairs: 
+            indice_pair_num: 
+            stream: 
+        """
+        ...
+    @staticmethod
    def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None: 
        """
        Args:
@@ -369,7 +396,18 @@ class SpconvOps:
    @staticmethod
    def get_int32_max() -> int: ...
    @staticmethod
-    def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0) -> Tensor: 
+    def get_indice_gen_workspace_size(kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> int: 
+        """
+        Args:
+            kv: 
+            num_act_in: 
+            num_act_out_bound: 
+            subm: 
+            use_int64_hash_k: 
+        """
+        ...
+    @staticmethod
+    def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]: 
        """
        Args:
            allocator: 
@@ -386,10 +424,11 @@ class SpconvOps:
            transposed: 
            is_train: 
            stream_int: 
+            num_out_act_bound: 
        """
        ...
    @staticmethod
-    def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0) -> None: 
+    def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> int: 
        """
        Args:
            allocator: 
@@ -405,12 +444,6 @@ class SpconvOps:
            subm: 
            transposed: 
            stream_int: 
-        """
+            num_out_act_bound: 
-        ...
-    @staticmethod
-    def test_allocator(allocator) -> None: 
-        """
-        Args:
-            allocator: 
        """
        ...
--- a/spconv/core_cc/csrc/sparse/alloc.pyi
+++ b/spconv/core_cc/csrc/sparse/alloc.pyi
@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
 from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
 class ExternalAllocator:
-    def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor: 
+    def zeros(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
            shape: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
        """
        ...
-    def empty(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor: 
+    def empty(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
            shape: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
        """
        ...
-    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> Tensor: 
+    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
@@ -28,9 +32,11 @@ class ExternalAllocator:
            value: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
        """
        ...
-    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> Tensor: 
+    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
@@ -38,6 +44,14 @@ class ExternalAllocator:
            value: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
+        """
+        ...
+    def get_tensor_by_name(self, name: str) -> Tensor: 
+        """
+        Args:
+            name: 
        """
        ...
    def free(self, ten: Tensor) -> None: 

--- a/spconv/core_cc/csrc/sparse/convops/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/__init__.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import GemmAlgoDesp
+from cumm.tensorview.gemm import ConvAlgoDesp
+from cumm.tensorview import Tensor
+from ...csrc.sparse.convops import ExternalSpconvMatmul
+class GemmTuneResult:
+    algo_desp: GemmAlgoDesp
+    arch: Tuple[int, int]
+    splitk: int
+    def is_valid(self) -> bool: ...
+    @overload
+    def __init__(self) -> None: ...
+    @overload
+    def __init__(self, algo_desp: GemmAlgoDesp, arch: Tuple[int, int], splitk: int) -> None: 
+        """
+        Args:
+            algo_desp: 
+            arch: 
+            splitk: 
+        """
+        ...
+class ConvTuneResult:
+    algo_desp: ConvAlgoDesp
+    arch: Tuple[int, int]
+    splitk: int
+    @overload
+    def __init__(self) -> None: ...
+    @overload
+    def __init__(self, algo_desp: ConvAlgoDesp, arch: Tuple[int, int], splitk: int) -> None: 
+        """
+        Args:
+            algo_desp: 
+            arch: 
+            splitk: 
+        """
+        ...
+    def is_valid(self) -> bool: ...
+class ExternalSpconvMatmul:
+    def indice_conv_init_gemm(self, features_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, out_channel: int, stream_int: int = 0) -> Tensor: 
+        """
+        Args:
+            features_n: 
+            filters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            kv_center: 
+            out_channel: 
+            stream_int: 
+        """
+        ...
+    def indice_conv_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None: 
+        """
+        Args:
+            inp_buffer_n: 
+            out_buffer_n: 
+            filters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            nhot: 
+            index: 
+        """
+        ...
+    def indice_conv_bwd_init_gemm(self, features_n: str, filters_n: str, out_bp_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, stream_int: int = 0) -> Tensor: 
+        """
+        Args:
+            features_n: 
+            filters_n: 
+            out_bp_n: 
+            dfilters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            kv_center: 
+            stream_int: 
+        """
+        ...
+    def indice_conv_bwd_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None: 
+        """
+        Args:
+            inp_buffer_n: 
+            out_buffer_n: 
+            filters_n: 
+            dfilters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            nhot: 
+            index: 
+        """
+        ...
+class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
+    def __init__(self, alloc) -> None: 
+        """
+        Args:
+            alloc: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/convops/convops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/convops.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import ConvAlgoDesp
+from cumm.tensorview import Tensor
+from cumm.tensorview.gemm import NVRTCParams
+from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
+from cumm.tensorview import CUDAKernelTimer
+class ConvTunerSimple:
+    def __init__(self, desps: List[ConvAlgoDesp]) -> None: 
+        """
+        Args:
+            desps: 
+        """
+        ...
+    @staticmethod
+    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]: 
+        """
+        Args:
+            arch: 
+        """
+        ...
+    def get_all_available(self, inp: Tensor, weight: Tensor, out: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], op_type: int, mask_width: int, auto_fp32_accum: bool, fp32_accum: bool) -> List[ConvAlgoDesp]: 
+        """
+        Args:
+            inp: 
+            weight: 
+            out: 
+            layout_i: 
+            layout_w: 
+            layout_o: 
+            interleave_i: 
+            interleave_w: 
+            interleave_o: 
+            arch: 
+            op_type: 
+            mask_width: 
+            auto_fp32_accum: 
+            fp32_accum: 
+        """
+        ...
+    def cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams: 
+        """
+        Args:
+            desp: 
+            arch: 
+            stream_int: 
+        """
+        ...
+    def tune_and_cache(self, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], mask: Tensor, mask_argsort: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, mask_output: Tensor =  Tensor(), alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, auto_fp32_accum: bool = True, fp32_accum: bool = False, num_run: int = 5) -> Tuple[ConvTuneResult, float]: 
+        """
+        Args:
+            op_type: 
+            inp: 
+            weight: 
+            output: 
+            layout_i: 
+            layout_w: 
+            layout_o: 
+            interleave_i: 
+            interleave_w: 
+            interleave_o: 
+            arch: 
+            mask: 
+            mask_argsort: 
+            indices: 
+            reverse_mask: 
+            mask_filter: 
+            mask_width: 
+            mask_output: 
+            alpha: 
+            beta: 
+            stream_int: 
+            auto_fp32_accum: 
+            fp32_accum: 
+            num_run: 
+        """
+        ...
+    def get_tuned_algo(self, op_type: int, i_dtype: int, w_dtype: int, o_dtype: int, k: int, c: int, arch: Tuple[int, int], mask_width: int = -1) -> Tuple[Any, bool]: 
+        """
+        Args:
+            op_type: 
+            i_dtype: 
+            w_dtype: 
+            o_dtype: 
+            k: 
+            c: 
+            arch: 
+            mask_width: 
+        """
+        ...
+    def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor =  Tensor(), verbose: bool = False, timer: CUDAKernelTimer =  CUDAKernelTimer(false), force_nvrtc: bool = False) -> None: 
+        """
+        Args:
+            profile_res: 
+            op_type: 
+            inp: 
+            weight: 
+            output: 
+            mask: 
+            mask_argsort: 
+            mask_output: 
+            indices: 
+            reverse_mask: 
+            mask_filter: 
+            mask_width: 
+            alpha: 
+            beta: 
+            stream_int: 
+            workspace: 
+            verbose: 
+            timer: 
+            force_nvrtc: 
+        """
+        ...
+    def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int: 
+        """
+        Args:
+            desp: 
+            splitk: 
+            op_type: 
+            N: 
+            C: 
+            K: 
+            kv: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import GemmAlgoDesp
+from cumm.tensorview import Tensor
+from cumm.tensorview.gemm import NVRTCParams
+from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
+from cumm.tensorview import CUDAKernelTimer
+class GemmTunerSimple:
+    def __init__(self, desps: List[GemmAlgoDesp]) -> None: 
+        """
+        Args:
+            desps: 
+        """
+        ...
+    @staticmethod
+    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]: 
+        """
+        Args:
+            arch: 
+        """
+        ...
+    def get_all_available(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int) -> List[GemmAlgoDesp]: 
+        """
+        Args:
+            a: 
+            b: 
+            c: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            shuffle_type: 
+        """
+        ...
+    def cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams: 
+        """
+        Args:
+            desp: 
+            arch: 
+            stream_int: 
+        """
+        ...
+    def tune_and_cache(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, num_run: int = 5) -> Tuple[GemmTuneResult, float]: 
+        """
+        Args:
+            a: 
+            b: 
+            c: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            shuffle_type: 
+            a_inds: 
+            b_inds: 
+            c_inds: 
+            hint: 
+            alpha: 
+            beta: 
+            stream_int: 
+            num_run: 
+        """
+        ...
+    def get_tuned_algo(self, a_dtype: int, b_dtype: int, c_dtype: int, a_shape: List[int], b_shape: List[int], c_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds_shape: List[int], b_inds_shape: List[int], c_inds_shape: List[int], hint: int = 0) -> Tuple[Any, bool]: 
+        """
+        Args:
+            a_dtype: 
+            b_dtype: 
+            c_dtype: 
+            a_shape: 
+            b_shape: 
+            c_shape: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            shuffle_type: 
+            a_inds_shape: 
+            b_inds_shape: 
+            c_inds_shape: 
+            hint: 
+        """
+        ...
+    def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor =  Tensor(), timer: CUDAKernelTimer =  CUDAKernelTimer(False), force_nvrtc: bool = False) -> None: 
+        """
+        Args:
+            profile_res: 
+            a: 
+            b: 
+            c: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            stream_int: 
+            shuffle_type: 
+            a_inds: 
+            b_inds: 
+            c_inds: 
+            hint: 
+            alpha: 
+            beta: 
+            workspace: 
+            timer: 
+            force_nvrtc: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/convops/spops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/spops.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
+class ConvGemmOps:
+    @staticmethod
+    def get_compute_capability(index: int = -1) -> Tuple[int, int]: 
+        """
+        Args:
+            index: 
+        """
+        ...
+    @staticmethod
+    def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None: 
+        """
+        1. This function needs to take out features
+        that come from the subm first mm.
+        2. This function doesn't support CPU.
+        Args:
+            allocator: 
+            ext_mm: 
+            gemm_tuner: 
+            all_w_is_krsc: 
+            filter_hwio: 
+            features: 
+            filters: 
+            indice_pairs: 
+            indice_pair_num: 
+            num_activate_out: 
+            inverse: 
+            subm: 
+            algo: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def indice_conv_backward(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None: 
+        """
+        Args:
+            allocator: 
+            ext_mm: 
+            gemm_tuner: 
+            all_w_is_krsc: 
+            filter_hwio: 
+            features: 
+            filters: 
+            out_bp: 
+            indice_pairs: 
+            indice_pair_num: 
+            inverse: 
+            subm: 
+            algo: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer =  CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> int: 
+        """
+        Args:
+            allocator: 
+            conv_tuner: 
+            features: 
+            filters: 
+            pair_fwd: 
+            pair_mask_fwd_splits: 
+            mask_argsort_fwd_splits: 
+            num_activate_out: 
+            masks: 
+            is_train: 
+            is_subm: 
+            stream_int: 
+            timer: 
+            auto_fp32_accum: 
+            fp32_accum: 
+        """
+        ...
+    @staticmethod
+    def implicit_gemm_backward(allocator, conv_tuner, features: Tensor, filters: Tensor, out_bp: Tensor, pair_fwd: Tensor, pair_bwd: Tensor, pair_mask_fwd_splits: List[Tensor], pair_mask_bwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], mask_argsort_bwd_splits: List[Tensor], mask_output_fwd: Tensor, masks: Tensor, mask_width: int, is_subm: bool, stream_int: int = 0, timer: CUDAKernelTimer =  CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> None: 
+        """
+        Args:
+            allocator: 
+            conv_tuner: 
+            features: 
+            filters: 
+            out_bp: 
+            pair_fwd: 
+            pair_bwd: 
+            pair_mask_fwd_splits: 
+            pair_mask_bwd_splits: 
+            mask_argsort_fwd_splits: 
+            mask_argsort_bwd_splits: 
+            mask_output_fwd: 
+            masks: 
+            mask_width: 
+            is_subm: 
+            stream_int: 
+            timer: 
+            auto_fp32_accum: 
+            fp32_accum: 
+        """
+        ...
--- a/spconv/core_cc/cumm/common.pyi
+++ b/spconv/core_cc/cumm/common.pyi
@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue
 class CompileInfo:
    @staticmethod
    def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
+    @staticmethod
+    def arch_is_compiled(arch: Tuple[int, int]) -> bool: 
+        """
+        Args:
+            arch: 
+        """
+        ...
--- a/spconv/core_cc/cumm/gemm/main.pyi
+++ b/spconv/core_cc/cumm/gemm/main.pyi
 from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import GemmAlgoDesp
 from cumm.tensorview.gemm import GemmParams
 class GemmMainUnitTest:
    @staticmethod
-    def get_all_algo_desp() -> List[Any]: ...
+    def get_all_algo_desp() -> List[GemmAlgoDesp]: ...
    @staticmethod
-    def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: str = "0", a_inds_shape: List[int] =  [], b_inds_shape: List[int] =  [], c_inds_shape: List[int] =  []) -> Tuple[int, int, int]: 
+    def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: int = 0, a_inds_shape: List[int] =  [], b_inds_shape: List[int] =  [], c_inds_shape: List[int] =  []) -> Tuple[int, int, int]: 
        """
        Args:
            a_shape: 

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
--- a/spconv/csrc/sparse/alloc.py
+++ b/spconv/csrc/sparse/alloc.py
 import pccm 
 from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
+from spconv.constants import AllocKeys
 class ExternalAllocatorGuard(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
        return code.ret("tv::Tensor")
    @pccm.pybind.mark(virtual=True)
@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
        return code.ret("tv::Tensor")
    @pccm.pybind.mark(virtual=True)
@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
        return code.ret("tv::Tensor")
    @pccm.pybind.mark(virtual=True)
@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        return code.ret("tv::Tensor")
+    # Declares a pure-virtual C++ member (also marked virtual for pybind) that
+    # takes a ``std::string name`` and returns a ``tv::Tensor``. No body is
+    # emitted here; concrete allocators must supply the implementation.
+    # NOTE(review): presumably the lookup of an already-allocated tensor by
+    # its allocation name -- confirm against the concrete allocators.
+    @pccm.pybind.mark(virtual=True)
+    @pccm.member_function(virtual=True, pure_virtual=True)
+    def get_tensor_by_name(self):
+        code = pccm.code()
+        code.arg("name", "std::string")
        return code.ret("tv::Tensor")
    @pccm.pybind.mark(virtual=True)
@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        // "" means temp memory
-        auto ten = zeros("", shape, dtype, device);
+        auto ten = zeros(name, shape, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = empty("", shape, dtype, device);
+        auto ten = empty(name, shape, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = full_int("", shape, value, dtype, device);
+        auto ten = full_int(name, shape, value, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -150,8 +176,10 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = full_float("", shape, value, dtype, device);
+        auto ten = full_float(name, shape, value, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
            this->free(t);
        }});
@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class):
        code.arg("num_bytes", "std::ptrdiff_t")
        code.ret("char*")
        code.raw(f"""
-        auto ten = allocator_.empty("", {{num_bytes}}, tv::uint8, 0);
+        auto ten = allocator_.empty({pccm.literal(AllocKeys.ThrustTemp)}, {{num_bytes}}, tv::uint8, 0);
        return reinterpret_cast<char*>(ten.raw_data());
        """)
        return code
@@ -193,3 +221,158 @@ class ThrustAllocator(pccm.Class):
        return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
        """)
        return code
+class StaticAllocator(ExternalAllocator):
+    """A simple allocator for tensorrt plugin.
+    Named requests are answered with a view (``tv::from_blob``) onto the
+    tensor stored under that name in ``tensor_dict_``; nothing is allocated
+    for them, so ``free`` / ``free_noexcept`` are no-ops. The only tensor
+    this class allocates itself is the thrust scratch buffer requested with
+    ``AllocKeys.ThrustTemp``, which is cached in ``thrust_tmp_tensor_``.
+    """
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorView)
+        # name -> tensor that backs every named request.
+        self.add_member("tensor_dict_", "std::unordered_map<std::string, tv::Tensor>")
+        # listing of the dict built once in the ctor; only used in the
+        # "can't find" error message of get_tensor_by_name.
+        self.add_member("repr_", "std::string")
+        # cached scratch buffer returned for AllocKeys.ThrustTemp requests.
+        self.add_member("thrust_tmp_tensor_", "tv::Tensor")
+        # factor applied to the requested size when the scratch buffer must grow.
+        self.grow = 1.5
+    @pccm.pybind.mark 
+    @pccm.constructor
+    def ctor(self):
+        # Stores the dict and pre-renders "name shape dtype" lines into repr_.
+        code = pccm.code()
+        code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
+        code.ctor_init("tensor_dict_", "tensor_dict")
+        code.raw(f"""
+        std::stringstream ss;
+        for (auto& p : tensor_dict){{
+            tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
+        }}
+        repr_ = ss.str();
+        """)
+        return code 
+    @pccm.member_function(virtual=True)
+    def _get_raw_and_check(self):
+        # Looks up the tensor registered under ``name``, asserts that it is on
+        # ``device`` and large enough for ``shape``/``dtype``, then returns a
+        # view of its memory with the requested shape and dtype.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        # The accumulate init value must be int64_t: std::accumulate keeps its
+        # running value in the type of the init argument, so a plain ``1``
+        # would accumulate in ``int`` and overflow for large shapes.
+        # NOTE(review): the required size is computed with tv::bit_size while
+        # the capacity is res.nbytes(); if bit_size returns bits rather than
+        # bytes this check is stricter than necessary -- confirm the units.
+        code.raw(f"""
+        auto res = get_tensor_by_name(name);
+        size_t total = std::accumulate(shape.begin(), shape.end(), int64_t(1), std::multiplies<int64_t>());
+        TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype)) 
+            && res.device() == device, "alloc failed", shape, res.shape());
+        return tv::from_blob(res.raw_data(), shape, dtype, device);
+        """)
+        return code.ret("tv::Tensor")
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def zeros(self):
+        # Returns the checked view for ``name`` after zeroing it on ``stream``.
+        # ``is_temp_memory`` is accepted for interface compatibility only.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        auto tvctx = tv::Context();
+        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
+        auto blob = _get_raw_and_check(name, shape, dtype, device);
+        return blob.zero_(tvctx);
+        """)
+        return code.ret("tv::Tensor")
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def empty(self):
+        # Returns uninitialized memory. AllocKeys.ThrustTemp is served from a
+        # cached scratch tensor that is (re)allocated when missing or when
+        # shape[0] exceeds its first dimension; every other name is served
+        # from tensor_dict_ through _get_raw_and_check.
+        # The scratch branch indexes shape[0], so it assumes a non-empty
+        # (1-D) shape, which is what ThrustAllocator passes.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
+            // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
+            // we assume each allocator always handle one stream
+            // so we can just use one tensor
+            tv::Tensor res = thrust_tmp_tensor_;
+            if (res.empty()){{
+                res = tv::empty(shape, dtype, device);
+                thrust_tmp_tensor_ = res;
+            }}
+            if (shape[0] > thrust_tmp_tensor_.dim(0)){{
+                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
+                thrust_tmp_tensor_ = res;
+            }}
+            return res;
+        }}else{{
+            auto blob = _get_raw_and_check(name, shape, dtype, device);
+            return blob;
+        }}
+        """)
+        return code.ret("tv::Tensor")
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def full_int(self):
+        # Returns the checked view for ``name`` filled with the int ``value``.
+        # The context is bound to ``stream`` the same way ``zeros`` does, so
+        # the fill is issued on the caller's stream.
+        # NOTE(review): fill_ is called as (context, value) as in the
+        # original code -- confirm the argument order against tv::Tensor.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("value", "int")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        auto tvctx = tv::Context();
+        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
+        auto blob = _get_raw_and_check(name, shape, dtype, device);
+        return blob.fill_(tvctx, value);
+        """)
+        return code.ret("tv::Tensor")
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def full_float(self):
+        # Returns the checked view for ``name`` filled with the float ``value``.
+        # ``tvctx`` has to be declared here: the emitted C++ referenced it
+        # without a declaration before, which cannot compile. It is bound to
+        # ``stream`` like in ``zeros``.
+        # NOTE(review): fill_ is called as (context, value) as in the
+        # original code -- confirm the argument order against tv::Tensor.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("value", "float")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        auto tvctx = tv::Context();
+        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
+        auto blob = _get_raw_and_check(name, shape, dtype, device);
+        return blob.fill_(tvctx, value);
+        """)
+        return code.ret("tv::Tensor")
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def get_tensor_by_name(self):
+        # Returns the tensor stored under ``name``; raises through
+        # TV_ASSERT_RT_ERR (listing the known tensors) when it is missing.
+        # A single find() replaces the former find() + at() double lookup.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.raw(f"""
+        auto iter = tensor_dict_.find(name);
+        TV_ASSERT_RT_ERR(iter != tensor_dict_.end(), "can't find", name, "exists:\\n", repr_);
+        return iter->second;
+        """)
+        return code.ret("tv::Tensor")
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def free(self):
+        # Intentionally empty body: no code is emitted for releasing ``ten``.
+        code = pccm.code()
+        code.arg("ten", "tv::Tensor")
+        return code
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def free_noexcept(self):
+        # Intentionally empty body, same as ``free``.
+        code = pccm.code()
+        code.arg("ten", "tv::Tensor")
+        return code
--- a/spconv/csrc/sparse/convops.py
+++ b/spconv/csrc/sparse/convops.py