working on c++ only

899008fa · yan.yan · f78575ea · f78575ea · 899008fa · 899008fa
Commit 899008fa authored Jul 20, 2022 by yan.yan
20 changed files
--- a/docs/DEVELOPMENT.md
+++ b/docs/DEVELOPMENT.md
-<!--
- Copyright 2021 Yan Yan
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-->
-
-# How to develop spconv 2.x
-
-## First step
-
-spconv 2.x is written in a unique C++ framework, ```pccm```. Read the [pccm guide]() to learn how to use ```pccm```.
-
-It's recommended to uninstall the pip-installed spconv and cumm, then install both spconv and cumm in editable mode (```pip install -e .```).
-
-## Architecture
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1":
    from spconv.csrc.utils import BoxOps
    from spconv.csrc.hash.core import HashTable
    from cumm.common import CompileInfo
+    from spconv.csrc.sparse.alloc import ExternalAllocator
+    from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
+    from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps

    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
    convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1":
            std = "c++14" 
        else:
            std = "c++17"
-    cus = [cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
    if CUMM_CPU_ONLY_BUILD:
-        cus = [SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
+        # CPU-only build: cu/convcu are not added to `cus` (see the
+        # `cus.extend` below), so the tuners are created without them.
+        gemmtuner = GemmTunerSimple(None)
+        convtuner = ConvTunerSimple(None)
+    else:
+        gemmtuner = GemmTunerSimple(cu)
+        convtuner = ConvTunerSimple(convcu)
+    # Namespaces and the combined ops unit are identical for both build modes.
+    gemmtuner.namespace = "csrc.sparse.convops.gemmops"
+    convtuner.namespace = "csrc.sparse.convops.convops"
+    convops = ConvGemmOps(gemmtuner, convtuner)
+    convops.namespace = "csrc.sparse.convops.spops"
+    cus = [gemmtuner, convtuner,
+        convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(),
+        ExternalAllocator(),
+        ExternalSpconvMatmul()]
+    # cu/convcu are only added when this is not a CPU-only build.
+    if not CUMM_CPU_ONLY_BUILD:
+        cus.extend([cu, convcu])
    ext_modules: List[Extension] = [
        PCCMExtension(cus,
                      "spconv/core_cc",
                      Path(__file__).resolve().parent / "spconv",
-                      objects_folder="objects",
                      std=std,
                      disable_pch=True,
                      verbose=True)

--- a/spconv/algo.py
+++ b/spconv/algo.py
--- a/spconv/algocore.py
+++ b/spconv/algocore.py
@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp
 from cumm.tensorview.gemm import ShuffleStrideType as ShuffleStrideTypeCpp

 from cumm.tensorview.gemm import ConvParams, GemmAlgoDesp, GemmParams
-from cumm.gemm.main import GemmAlgoParams
-from cumm.conv.main import ConvAlgoParams, ConvIterAlgo
+from cumm.gemm.main import GemmAlgoParams, gen_gemm_kernels
+from cumm.conv.main import ConvAlgoParams, ConvIterAlgo, gen_gemm_kernels as gen_conv_kernels
 from cumm import dtypes
 from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
                             ConvLayoutType, ConvMode, ConvOpType)
@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
    desp.access_per_vector = p.access_per_vector
    desp.is_nvrtc = p.is_nvrtc

-
 def get_gemm_algo_desp_from_param(p: GemmAlgoParams):
    desp = GemmAlgoDesp()
    _assign_gemm_desp_props(desp, p)
+    # here we must generate kernel for element-per-access data
+    ker = gen_gemm_kernels(p)
+    desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
+    desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
+    desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
+
    return desp


@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
    desp.interleave_o = p.layout_desp_output.interleave
    desp.mask_sparse = p.mask_sparse
    desp.increment_k_first = p.increment_k_first
+    ker = gen_conv_kernels(p)
+    desp.element_per_access_a = ker.input_spec.input_iter_a.element_per_acc
+    desp.element_per_access_b = ker.input_spec.input_iter_b.element_per_acc
+    desp.element_per_access_c = ker.output_spec.out_iter.element_per_acc
    return desp


@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
    p.is_nvrtc = desp.is_nvrtc


+
 def get_gemm_param_from_desp(desp: GemmAlgoDesp):
    p = GemmAlgoParams((0, 0, 0), (0, 0, 0), 0, "s8,s8,s8,s8,s8", False, False,
                          False, GemmAlgo.Simt)

--- a/spconv/benchmark/me.py
+++ b/spconv/benchmark/me.py
-"""Benchmark MinkowskiEngine
-"""
-from spconv.benchmark.core import get_voxel_data
-
-import time
-from pathlib import Path
-
-import numpy as np
-import torch
-from torch import nn
-from spconv.core import ConvAlgo
-from cumm import dtypes
-from spconv.test_utils import params_grid
-
-_DTYPE_TO_TORCH_DTYPE = {
-    dtypes.float32: torch.float32,
-    dtypes.float16: torch.float16,
-}
-
-def bench_me_basic(dtype_str: str):
-    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
-    if dtype not in _DTYPE_TO_TORCH_DTYPE:
-        raise NotImplementedError("only support bench f32 and f16 for now")
-    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
--- a/spconv/benchmark/thsp.py
+++ b/spconv/benchmark/thsp.py
-"""Benchmark torchsparse
-"""
-from spconv.benchmark.core import get_voxel_data
-
-import time
-from pathlib import Path
-
-import numpy as np
-import torch
-from torch import nn
-from spconv.core import ConvAlgo
-from cumm import dtypes
-from spconv.test_utils import params_grid
-
-_DTYPE_TO_TORCH_DTYPE = {
-    dtypes.float32: torch.float32,
-    dtypes.float16: torch.float16,
-}
-
-def bench_torchsparse_basic(dtype_str: str):
-    dtype = dtypes.get_dtype_by_shortcut(dtype_str)
-    if dtype not in _DTYPE_TO_TORCH_DTYPE:
-        raise NotImplementedError("only support bench f32 and f16 for now")
-    torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
--- a/spconv/build.py
+++ b/spconv/build.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 from pathlib import Path
+from typing import List

 import pccm
 from pccm.utils import project_is_editable, project_is_installed
@@ -32,17 +33,48 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
    from spconv.csrc.sparse.alloc import ExternalAllocator
    from spconv.csrc.utils import BoxOps
    from spconv.csrc.hash.core import HashTable
+    from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
+    from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
+    from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul
+
    all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
    all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
    cu = GemmMainUnitTest(all_shuffle)
    cu.namespace = "cumm.gemm.main"
    all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
-                              IMPLGEMM_TURING_PARAMS)
+               IMPLGEMM_TURING_PARAMS)
    all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
    convcu = ConvMainUnitTest(all_imp)
    convcu.namespace = "cumm.conv.main"
-    pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo(), ExternalAllocator()],
+    gemmtuner = GemmTunerSimple(cu)
+    gemmtuner.namespace = "csrc.sparse.convops.gemmops"
+    convtuner = ConvTunerSimple(convcu)
+    convtuner.namespace = "csrc.sparse.convops.convops"
+    convops = ConvGemmOps(gemmtuner, convtuner)
+    convops.namespace = "csrc.sparse.convops.spops"
+
+    cus = [
+        cu, convcu, gemmtuner, convtuner,
+        convops,
+        SpconvOps(),
+        BoxOps(),
+        HashTable(),
+        CompileInfo(),
+        ExternalAllocator(),
+        ExternalSpconvMatmul(),
+        SimpleExternalSpconvMatmul(),
+
+    ]
+    pccm.builder.build_pybind(cus,
                              PACKAGE_ROOT / "core_cc",
                              namespace_root=PACKAGE_ROOT,
-                              load_library=False)
+                              load_library=False,
+                              verbose=True)

+    # cus_dev: List[pccm.Class] = [
+    # ]
+    # pccm.builder.build_pybind(cus_dev,
+    #                           PACKAGE_ROOT / "core_cc_dev",
+    #                           namespace_root=PACKAGE_ROOT,
+    #                           load_library=False,
+    #                           verbose=True)
--- a/spconv/constants.py
+++ b/spconv/constants.py
@@ -30,6 +30,7 @@ if _filter_hwio_env is not None:
    raise NotImplementedError("SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead.")

 DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1"
+
 NDIM_DONT_CARE = 3
 FILTER_HWIO = False

@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32,
 SPCONV_NVRTC_MODE = NVRTCMode.ConstantMemory
 SPCONV_DEBUG_NVRTC_KERNELS = False

+SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME)
+

-class SpconvAllocatorKeys:
+class AllocKeys:
    Pair = "Pair"
    IndiceNumPerLoc = "IndiceNumPerLoc"
    PairMask = "PairMask"
@@ -72,5 +75,31 @@ class SpconvAllocatorKeys:
    # MaskArgSortFwd = "MaskArgSortFwd"
    MaskArgSortBwd = "MaskArgSortBwd"

+    MaskOutputFwd = "MaskOutputFwd"
+
    OutFeatures = "OutFeatures"
+
+    Features = "Features"
+    Filters = "Filters"
+    OutBp = "OutBp"
+    DIn = "DIn"
+    DFilters = "DFilters"
+
+    InpBuffer = "InpBuffer"
+    OutBuffer = "OutBuffer"
+
+    IndicePairsUniq = "IndicePairsUniq"
+    IndicePairsUniqBackup = "IndicePairsUniqBackup"
+
+    HashKOrKV = "HashKOrKV"
+    HashV = "HashV"
+
+    ThrustTemp = "ThrustTemp"
+
+
 SPCONV_DEBUG_WEIGHT = False
+
+SPCONV_CPP_INDICE_PAIRS = True 
+SPCONV_CPP_INDICE_PAIRS_IGEMM = True 
+
+SPCONV_CPP_GEMM = True
\ No newline at end of file
--- a/spconv/core.py
+++ b/spconv/core.py
@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo
 from cumm.gemm import kernel
 from typing import List
 from cumm.gemm.algospec.core import TensorOp
-from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvIterAlgo, GemmAlgo
+from cumm.conv.main import gen_gemm_params as gen_conv_params, ConvFwdAndBwdInput, ConvBwdWeight, ConvFwd, ConvIterAlgo, GemmAlgo
 from cumm.conv.bases import (NCHW, NHWC, ConvIterAlgo, ConvLayout,
                             ConvLayoutType, ConvMode, ConvOpType)
+from spconv.algocore import get_gemm_algo_desp_from_param
 from spconv.constants import NDIM_DONT_CARE


@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [
                     increment_k_first=True,
                     access_per_vector=1),
 ]
-IMPLGEMM_SIMT_PARAMS = [
-    *gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8),
-                     NDIM_DONT_CARE,
-                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
-                     NHWC,
-                     NHWC,
-                     NHWC,
-                     GemmAlgo.Simt,
-                     None,
-                     mask_sparse=True,
-                     increment_k_first=True,
-                     access_per_vector=1),
-    *gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8),
-                     NDIM_DONT_CARE,
-                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
-                     NHWC,
-                     NHWC,
-                     NHWC,
-                     GemmAlgo.Simt,
-                     None,
-                     mask_sparse=True,
-                     increment_k_first=True,
-                     access_per_vector=1),
-]


 IMPLGEMM_VOLTA_PARAMS = [
@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [
    #     NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),

    # gen_conv_params(ConvFwdAndBwdInput, )
+    # int8 kernels. NOTE(review): the active entries below set is_nvrtc=False; only the commented-out variants use is_nvrtc=True.
+    *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+
+    *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (32, 64, 64), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 128, 32), (32, 64, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 64, 32), (32, 64, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 64, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 32, 32), (32, 32, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (128, 128, 64), (64, 64, 64),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+    *gen_conv_params(ConvFwd, (64, 128, 64), (32, 64, 64),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["s8,s8,s8,s32,s32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=False),
+
+
+    # *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
+    #                  NDIM_DONT_CARE,
+    #                  ConvIterAlgo.Optimized,
+    #                  2, ["s8,s8,s8,s32,s32"],
+    #                  NHWC,
+    #                  NHWC,
+    #                  NHWC,
+    #                  GemmAlgo.Turing,
+    #                  TensorOp((8, 8, 16)),
+    #                  mask_sparse=True,
+    #                  increment_k_first=True,
+    #                  access_per_vector=0,
+    #                  is_nvrtc=True),
+
+    # *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
+    #                  NDIM_DONT_CARE,
+    #                  ConvIterAlgo.Optimized,
+    #                  2, ["s8,s8,s8,s32,s32"],
+    #                  NHWC,
+    #                  NHWC,
+    #                  NHWC,
+    #                  GemmAlgo.Turing,
+    #                  TensorOp((8, 8, 16)),
+    #                  mask_sparse=True,
+    #                  increment_k_first=True,
+    #                  access_per_vector=0,
+    #                  is_nvrtc=True),
+    # *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
+    #                  NDIM_DONT_CARE,
+    #                  ConvIterAlgo.Optimized,
+    #                  2, ["s8,s8,s8,s32,s32"],
+    #                  NHWC,
+    #                  NHWC,
+    #                  NHWC,
+    #                  GemmAlgo.Turing,
+    #                  TensorOp((8, 8, 16)),
+    #                  mask_sparse=True,
+    #                  increment_k_first=True,
+    #                  access_per_vector=0,
+    #                  is_nvrtc=True),
+
 ]

 ALL_NATIVE_PARAMS = SHUFFLE_SIMT_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_VOLTA_PARAMS

--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
@@ -48,7 +48,7 @@ class SpconvOps:
        """
        ...
    @staticmethod
-    def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int: 
+    def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, indice_num_per_loc: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0, use_bound_algo: bool = False) -> int: 
        """
        Args:
            indices: 
@@ -58,6 +58,7 @@ class SpconvOps:
            indice_pairs_uniq: 
            indice_pairs_uniq_before_sort: 
            out_inds: 
+            indice_num_per_loc: 
            num_out_act: 
            batch_size: 
            output_dims: 
@@ -68,6 +69,7 @@ class SpconvOps:
            dilation: 
            transposed: 
            stream_int: 
+            use_bound_algo: 
        """
        ...
    @staticmethod
@@ -191,6 +193,31 @@ class SpconvOps:
        """
        ...
    @staticmethod
+    def indice_maxpool(out_features: Tensor, features: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, stream: int = 0) -> None: 
+        """
+        Args:
+            out_features: 
+            features: 
+            indice_pairs: 
+            indice_pair_num: 
+            num_activate_out: 
+            stream: 
+        """
+        ...
+    @staticmethod
+    def indice_maxpool_backward(din: Tensor, features: Tensor, out_features: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, stream: int = 0) -> None: 
+        """
+        Args:
+            din: 
+            features: 
+            out_features: 
+            out_bp: 
+            indice_pairs: 
+            indice_pair_num: 
+            stream: 
+        """
+        ...
+    @staticmethod
    def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None: 
        """
        Args:
@@ -369,7 +396,18 @@ class SpconvOps:
    @staticmethod
    def get_int32_max() -> int: ...
    @staticmethod
-    def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0) -> Tensor: 
+    def get_indice_gen_workspace_size(kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> int: 
+        """
+        Args:
+            kv: 
+            num_act_in: 
+            num_act_out_bound: 
+            subm: 
+            use_int64_hash_k: 
+        """
+        ...
+    @staticmethod
+    def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]: 
        """
        Args:
            allocator: 
@@ -386,10 +424,11 @@ class SpconvOps:
            transposed: 
            is_train: 
            stream_int: 
+            num_out_act_bound: 
        """
        ...
    @staticmethod
-    def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0) -> None: 
+    def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> int: 
        """
        Args:
            allocator: 
@@ -405,12 +444,6 @@ class SpconvOps:
            subm: 
            transposed: 
            stream_int: 
-        """
-        ...
-    @staticmethod
-    def test_allocator(allocator) -> None: 
-        """
-        Args:
-            allocator: 
+            num_out_act_bound: 
        """
        ...
--- a/spconv/core_cc/csrc/sparse/alloc.pyi
+++ b/spconv/core_cc/csrc/sparse/alloc.pyi
@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
 from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
 class ExternalAllocator:
-    def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor: 
+    def zeros(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
            shape: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
        """
        ...
-    def empty(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor: 
+    def empty(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
            shape: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
        """
        ...
-    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> Tensor: 
+    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
@@ -28,9 +32,11 @@ class ExternalAllocator:
            value: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
        """
        ...
-    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> Tensor: 
+    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
        """
        Args:
            name: 
@@ -38,6 +44,14 @@ class ExternalAllocator:
            value: 
            dtype: 
            device: 
+            is_temp_memory: 
+            stream: 
+        """
+        ...
+    def get_tensor_by_name(self, name: str) -> Tensor: 
+        """
+        Args:
+            name: 
        """
        ...
    def free(self, ten: Tensor) -> None: 

--- a/spconv/core_cc/csrc/sparse/convops/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/__init__.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import GemmAlgoDesp
+from cumm.tensorview.gemm import ConvAlgoDesp
+from cumm.tensorview import Tensor
+from ...csrc.sparse.convops import ExternalSpconvMatmul
+class GemmTuneResult:
+    algo_desp: GemmAlgoDesp
+    arch: Tuple[int, int]
+    splitk: int
+    def is_valid(self) -> bool: ...
+    @overload
+    def __init__(self) -> None: ...
+    @overload
+    def __init__(self, algo_desp: GemmAlgoDesp, arch: Tuple[int, int], splitk: int) -> None: 
+        """
+        Args:
+            algo_desp: 
+            arch: 
+            splitk: 
+        """
+        ...
+class ConvTuneResult:
+    algo_desp: ConvAlgoDesp
+    arch: Tuple[int, int]
+    splitk: int
+    @overload
+    def __init__(self) -> None: ...
+    @overload
+    def __init__(self, algo_desp: ConvAlgoDesp, arch: Tuple[int, int], splitk: int) -> None: 
+        """
+        Args:
+            algo_desp: 
+            arch: 
+            splitk: 
+        """
+        ...
+    def is_valid(self) -> bool: ...
+class ExternalSpconvMatmul:
+    def indice_conv_init_gemm(self, features_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, out_channel: int, stream_int: int = 0) -> Tensor: 
+        """
+        Args:
+            features_n: 
+            filters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            kv_center: 
+            out_channel: 
+            stream_int: 
+        """
+        ...
+    def indice_conv_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None: 
+        """
+        Args:
+            inp_buffer_n: 
+            out_buffer_n: 
+            filters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            nhot: 
+            index: 
+        """
+        ...
+    def indice_conv_bwd_init_gemm(self, features_n: str, filters_n: str, out_bp_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, stream_int: int = 0) -> Tensor: 
+        """
+        Args:
+            features_n: 
+            filters_n: 
+            out_bp_n: 
+            dfilters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            kv_center: 
+            stream_int: 
+        """
+        ...
+    def indice_conv_bwd_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None: 
+        """
+        Args:
+            inp_buffer_n: 
+            out_buffer_n: 
+            filters_n: 
+            dfilters_n: 
+            all_weight_is_krsc: 
+            is_kc_not_ck: 
+            nhot: 
+            index: 
+        """
+        ...
+class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
+    def __init__(self, alloc) -> None: 
+        """
+        Args:
+            alloc: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/convops/convops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/convops.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import ConvAlgoDesp
+from cumm.tensorview import Tensor
+from cumm.tensorview.gemm import NVRTCParams
+from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
+from cumm.tensorview import CUDAKernelTimer
+class ConvTunerSimple:
+    def __init__(self, desps: List[ConvAlgoDesp]) -> None: 
+        """
+        Args:
+            desps: 
+        """
+        ...
+    @staticmethod
+    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]: 
+        """
+        Args:
+            arch: 
+        """
+        ...
+    def get_all_available(self, inp: Tensor, weight: Tensor, out: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], op_type: int, mask_width: int, auto_fp32_accum: bool, fp32_accum: bool) -> List[ConvAlgoDesp]: 
+        """
+        Args:
+            inp: 
+            weight: 
+            out: 
+            layout_i: 
+            layout_w: 
+            layout_o: 
+            interleave_i: 
+            interleave_w: 
+            interleave_o: 
+            arch: 
+            op_type: 
+            mask_width: 
+            auto_fp32_accum: 
+            fp32_accum: 
+        """
+        ...
+    def cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams: 
+        """
+        Args:
+            desp: 
+            arch: 
+            stream_int: 
+        """
+        ...
+    def tune_and_cache(self, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], mask: Tensor, mask_argsort: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, mask_output: Tensor =  Tensor(), alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, auto_fp32_accum: bool = True, fp32_accum: bool = False, num_run: int = 5) -> Tuple[ConvTuneResult, float]: 
+        """
+        Args:
+            op_type: 
+            inp: 
+            weight: 
+            output: 
+            layout_i: 
+            layout_w: 
+            layout_o: 
+            interleave_i: 
+            interleave_w: 
+            interleave_o: 
+            arch: 
+            mask: 
+            mask_argsort: 
+            indices: 
+            reverse_mask: 
+            mask_filter: 
+            mask_width: 
+            mask_output: 
+            alpha: 
+            beta: 
+            stream_int: 
+            auto_fp32_accum: 
+            fp32_accum: 
+            num_run: 
+        """
+        ...
+    def get_tuned_algo(self, op_type: int, i_dtype: int, w_dtype: int, o_dtype: int, k: int, c: int, arch: Tuple[int, int], mask_width: int = -1) -> Tuple[Any, bool]: 
+        """
+        Args:
+            op_type: 
+            i_dtype: 
+            w_dtype: 
+            o_dtype: 
+            k: 
+            c: 
+            arch: 
+            mask_width: 
+        """
+        ...
+    def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor =  Tensor(), verbose: bool = False, timer: CUDAKernelTimer =  CUDAKernelTimer(False), force_nvrtc: bool = False) -> None: 
+        """
+        Type stub only (the body is ``...``); the implementation is not
+        part of this file.
+
+        Note: the ``timer`` default is spelled ``CUDAKernelTimer(False)``.
+        A lowercase ``false`` is not a Python name, so it must not appear
+        in a stub default.
+
+        Args:
+            profile_res: 
+            op_type: 
+            inp: 
+            weight: 
+            output: 
+            mask: 
+            mask_argsort: 
+            mask_output: 
+            indices: 
+            reverse_mask: 
+            mask_filter: 
+            mask_width: 
+            alpha: 
+            beta: 
+            stream_int: 
+            workspace: 
+            verbose: 
+            timer: 
+            force_nvrtc: 
+        """
+        ...
+    def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int: 
+        """
+        Args:
+            desp: 
+            splitk: 
+            op_type: 
+            N: 
+            C: 
+            K: 
+            kv: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import GemmAlgoDesp
+from cumm.tensorview import Tensor
+from cumm.tensorview.gemm import NVRTCParams
+from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
+from cumm.tensorview import CUDAKernelTimer
+class GemmTunerSimple:
+    def __init__(self, desps: List[GemmAlgoDesp]) -> None: 
+        """
+        Args:
+            desps: 
+        """
+        ...
+    @staticmethod
+    def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]: 
+        """
+        Args:
+            arch: 
+        """
+        ...
+    def get_all_available(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int) -> List[GemmAlgoDesp]: 
+        """
+        Args:
+            a: 
+            b: 
+            c: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            shuffle_type: 
+        """
+        ...
+    def cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams: 
+        """
+        Args:
+            desp: 
+            arch: 
+            stream_int: 
+        """
+        ...
+    def tune_and_cache(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, num_run: int = 5) -> Tuple[GemmTuneResult, float]: 
+        """
+        Args:
+            a: 
+            b: 
+            c: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            shuffle_type: 
+            a_inds: 
+            b_inds: 
+            c_inds: 
+            hint: 
+            alpha: 
+            beta: 
+            stream_int: 
+            num_run: 
+        """
+        ...
+    def get_tuned_algo(self, a_dtype: int, b_dtype: int, c_dtype: int, a_shape: List[int], b_shape: List[int], c_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds_shape: List[int], b_inds_shape: List[int], c_inds_shape: List[int], hint: int = 0) -> Tuple[Any, bool]: 
+        """
+        Args:
+            a_dtype: 
+            b_dtype: 
+            c_dtype: 
+            a_shape: 
+            b_shape: 
+            c_shape: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            shuffle_type: 
+            a_inds_shape: 
+            b_inds_shape: 
+            c_inds_shape: 
+            hint: 
+        """
+        ...
+    def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor =  Tensor(), timer: CUDAKernelTimer =  CUDAKernelTimer(False), force_nvrtc: bool = False) -> None: 
+        """
+        Args:
+            profile_res: 
+            a: 
+            b: 
+            c: 
+            trans_a: 
+            trans_b: 
+            trans_c: 
+            arch: 
+            stream_int: 
+            shuffle_type: 
+            a_inds: 
+            b_inds: 
+            c_inds: 
+            hint: 
+            alpha: 
+            beta: 
+            workspace: 
+            timer: 
+            force_nvrtc: 
+        """
+        ...
--- a/spconv/core_cc/csrc/sparse/convops/spops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/spops.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+from cumm.tensorview import CUDAKernelTimer
+class ConvGemmOps:
+    @staticmethod
+    def get_compute_capability(index: int = -1) -> Tuple[int, int]: 
+        """
+        Args:
+            index: 
+        """
+        ...
+    @staticmethod
+    def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None: 
+        """
+        Notes restated from the original wording:
+        1. This function needs to take the out features that come
+        from the first subm mm.
+        2. This function doesn't support CPU.
+
+        NOTE(review): this stub's signature has no out-features
+        parameter and returns None, so confirm that both notes really
+        apply to this function before relying on them.
+
+        Args:
+            allocator: 
+            ext_mm: 
+            gemm_tuner: 
+            all_w_is_krsc: 
+            filter_hwio: 
+            features: 
+            filters: 
+            indice_pairs: 
+            indice_pair_num: 
+            num_activate_out: 
+            inverse: 
+            subm: 
+            algo: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def indice_conv_backward(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None: 
+        """
+        Args:
+            allocator: 
+            ext_mm: 
+            gemm_tuner: 
+            all_w_is_krsc: 
+            filter_hwio: 
+            features: 
+            filters: 
+            out_bp: 
+            indice_pairs: 
+            indice_pair_num: 
+            inverse: 
+            subm: 
+            algo: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer =  CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> int: 
+        """
+        Args:
+            allocator: 
+            conv_tuner: 
+            features: 
+            filters: 
+            pair_fwd: 
+            pair_mask_fwd_splits: 
+            mask_argsort_fwd_splits: 
+            num_activate_out: 
+            masks: 
+            is_train: 
+            is_subm: 
+            stream_int: 
+            timer: 
+            auto_fp32_accum: 
+            fp32_accum: 
+        """
+        ...
+    @staticmethod
+    def implicit_gemm_backward(allocator, conv_tuner, features: Tensor, filters: Tensor, out_bp: Tensor, pair_fwd: Tensor, pair_bwd: Tensor, pair_mask_fwd_splits: List[Tensor], pair_mask_bwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], mask_argsort_bwd_splits: List[Tensor], mask_output_fwd: Tensor, masks: Tensor, mask_width: int, is_subm: bool, stream_int: int = 0, timer: CUDAKernelTimer =  CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> None: 
+        """
+        Args:
+            allocator: 
+            conv_tuner: 
+            features: 
+            filters: 
+            out_bp: 
+            pair_fwd: 
+            pair_bwd: 
+            pair_mask_fwd_splits: 
+            pair_mask_bwd_splits: 
+            mask_argsort_fwd_splits: 
+            mask_argsort_bwd_splits: 
+            mask_output_fwd: 
+            masks: 
+            mask_width: 
+            is_subm: 
+            stream_int: 
+            timer: 
+            auto_fp32_accum: 
+            fp32_accum: 
+        """
+        ...
--- a/spconv/core_cc/cumm/common.pyi
+++ b/spconv/core_cc/cumm/common.pyi
@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue
 class CompileInfo:
    @staticmethod
    def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
+    @staticmethod
+    def arch_is_compiled(arch: Tuple[int, int]) -> bool: 
+        """
+        Args:
+            arch: 
+        """
+        ...
--- a/spconv/core_cc/cumm/gemm/main.pyi
+++ b/spconv/core_cc/cumm/gemm/main.pyi
 from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview.gemm import GemmAlgoDesp
 from cumm.tensorview.gemm import GemmParams
 class GemmMainUnitTest:
    @staticmethod
-    def get_all_algo_desp() -> List[Any]: ...
+    def get_all_algo_desp() -> List[GemmAlgoDesp]: ...
    @staticmethod
-    def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: str = "0", a_inds_shape: List[int] =  [], b_inds_shape: List[int] =  [], c_inds_shape: List[int] =  []) -> Tuple[int, int, int]: 
+    def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: int = 0, a_inds_shape: List[int] =  [], b_inds_shape: List[int] =  [], c_inds_shape: List[int] =  []) -> Tuple[int, int, int]: 
        """
        Args:
            a_shape: 

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
--- a/spconv/csrc/sparse/alloc.py
+++ b/spconv/csrc/sparse/alloc.py
 import pccm 
 from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib

+from spconv.constants import AllocKeys
+
 class ExternalAllocatorGuard(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+
        return code.ret("tv::Tensor")

    @pccm.pybind.mark(virtual=True)
@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+
        return code.ret("tv::Tensor")

    @pccm.pybind.mark(virtual=True)
@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+
        return code.ret("tv::Tensor")

    @pccm.pybind.mark(virtual=True)
@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        return code.ret("tv::Tensor")
+
+    @pccm.pybind.mark(virtual=True)
+    @pccm.member_function(virtual=True, pure_virtual=True)
+    def get_tensor_by_name(self):
+        code = pccm.code()
+        code.arg("name", "std::string")
        return code.ret("tv::Tensor")

    @pccm.pybind.mark(virtual=True)
@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        // "" means temp memory
-        auto ten = zeros("", shape, dtype, device);
+        auto ten = zeros(name, shape, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = empty("", shape, dtype, device);
+        auto ten = empty(name, shape, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = full_int("", shape, value, dtype, device);
+        auto ten = full_int(name, shape, value, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -150,14 +176,16 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("name", "std::string", "\"\"")
+        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = full_float("", shape, value, dtype, device);
+        auto ten = full_float(name, shape, value, dtype, device, true, stream);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
            this->free(t);
        }});
        """)
        return code.ret(f"std::{self.ptr_type}_ptr<ExternalAllocatorGuard>")
-
+    
 class ThrustAllocator(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class):
        code.arg("num_bytes", "std::ptrdiff_t")
        code.ret("char*")
        code.raw(f"""
-        auto ten = allocator_.empty("", {{num_bytes}}, tv::uint8, 0);
+        auto ten = allocator_.empty({pccm.literal(AllocKeys.ThrustTemp)}, {{num_bytes}}, tv::uint8, 0);
        return reinterpret_cast<char*>(ten.raw_data());
        """)
        return code
@@ -192,4 +220,159 @@ class ThrustAllocator(pccm.Class):
        code.raw(f"""
        return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
        """)
-        return code        
+        return code
+
+class StaticAllocator(ExternalAllocator):
+    """a simple allocator for tensorrt plugin.
+    """
+    # Serves every named request from a fixed name -> tensor map handed to
+    # the generated C++ constructor; nothing is allocated for those names.
+    # The only buffer created on demand is the thrust scratch tensor, which
+    # is requested under AllocKeys.ThrustTemp and handled in `empty`.
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorView)
+        # pre-supplied buffers, keyed by allocation name
+        self.add_member("tensor_dict_", "std::unordered_map<std::string, tv::Tensor>")
+        # text listing of tensor_dict_ (name, shape, dtype), printed when a lookup fails
+        self.add_member("repr_", "std::string")
+        # scratch tensor returned for AllocKeys.ThrustTemp requests
+        self.add_member("thrust_tmp_tensor_", "tv::Tensor")
+        # factor applied to the requested size when the scratch tensor must be replaced
+        self.grow = 1.5
+
+    @pccm.pybind.mark
+    @pccm.constructor
+    def ctor(self):
+        # Stores the map and pre-renders `repr_` once, so that lookup
+        # failures can list every available tensor.
+        code = pccm.code()
+        code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
+        code.ctor_init("tensor_dict_", "tensor_dict")
+        code.raw(f"""
+        std::stringstream ss;
+        for (auto& p : tensor_dict){{
+            tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
+        }}
+        repr_ = ss.str();
+        """)
+        return code
+
+    @pccm.member_function(virtual=True)
+    def _get_raw_and_check(self):
+        # Looks `name` up, asserts the stored tensor is large enough and on
+        # the requested device, then returns a view of its memory with the
+        # requested shape/dtype (via tv::from_blob).
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        # Fix 1: std::accumulate accumulates in the type of its seed. With
+        # the literal `1` the product was computed in `int` and could
+        # overflow; seed with int64_t(1) instead.
+        # Fix 2: nbytes() is a byte count while tv::bit_size is a bit count,
+        # so the old check demanded 8x the memory actually needed. Convert
+        # to bytes (rounding up) before comparing.
+        # NOTE(review): assumes tv::bit_size returns bits, as its name
+        # says - confirm against tensorview.
+        code.raw(f"""
+        auto res = get_tensor_by_name(name);
+        size_t total = std::accumulate(shape.begin(), shape.end(), int64_t(1), std::multiplies<int64_t>());
+        size_t required_bytes = (total * tv::bit_size(tv::DType(dtype)) + 7) / 8;
+        TV_ASSERT_RT_ERR(res.nbytes() >= required_bytes 
+            && res.device() == device, "alloc failed", shape, res.shape());
+        return tv::from_blob(res.raw_data(), shape, dtype, device);
+        """)
+        return code.ret("tv::Tensor")
+
+
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def zeros(self):
+        # Returns the named buffer viewed with `shape`/`dtype` and zeroed
+        # on the caller's stream. `is_temp_memory` is accepted for
+        # interface compatibility but not used here.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        auto tvctx = tv::Context();
+        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
+        auto blob = _get_raw_and_check(name, shape, dtype, device);
+        return blob.zero_(tvctx);
+        """)
+        return code.ret("tv::Tensor")
+
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def empty(self):
+        # Two paths: the thrust scratch key gets a lazily created tensor
+        # that is replaced by one `self.grow` times the requested size when
+        # too small; every other name is served from tensor_dict_.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
+            // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
+            // we assume each allocator always handle one stream
+            // so we can just use one tensor
+            tv::Tensor res = thrust_tmp_tensor_;
+            if (res.empty()){{
+                res = tv::empty(shape, dtype, device);
+                thrust_tmp_tensor_ = res;
+            }}
+            if (shape[0] > thrust_tmp_tensor_.dim(0)){{
+                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
+                thrust_tmp_tensor_ = res;
+            }}
+            return res;
+        }}else{{
+            auto blob = _get_raw_and_check(name, shape, dtype, device);
+            return blob;
+        }}
+        """)
+        return code.ret("tv::Tensor")
+
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def full_int(self):
+        # Returns the named buffer filled with the integer `value`.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("value", "int")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        # Fix: bind the caller's stream to the context, exactly as `zeros`
+        # does; previously the `stream` argument was ignored here.
+        code.raw(f"""
+        auto tvctx = tv::Context();
+        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
+        auto blob = _get_raw_and_check(name, shape, dtype, device);
+        return blob.fill_(tvctx, value);
+        """)
+        return code.ret("tv::Tensor")
+
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def full_float(self):
+        # Returns the named buffer filled with the float `value`.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.arg("shape", "std::vector<int64_t>")
+        code.arg("value", "float")
+        code.arg("dtype", "int")
+        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
+        code.arg("stream", "std::uintptr_t", "0")
+        # Fix: `tvctx` was used without being declared, so the generated
+        # C++ could not compile. Create it and bind the caller's stream,
+        # exactly as `zeros` does.
+        code.raw(f"""
+        auto tvctx = tv::Context();
+        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
+        auto blob = _get_raw_and_check(name, shape, dtype, device);
+        return blob.fill_(tvctx, value);
+        """)
+        return code.ret("tv::Tensor")
+
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def get_tensor_by_name(self):
+        # Returns the stored tensor for `name`; asserts (listing every
+        # known tensor via repr_) when the name is missing.
+        code = pccm.code()
+        code.arg("name", "std::string")
+        code.raw(f"""
+        TV_ASSERT_RT_ERR(tensor_dict_.find(name) != tensor_dict_.end(), "can't find", name, "exists:\\n", repr_);
+        return tensor_dict_.at(name);
+        """)
+        return code.ret("tv::Tensor")
+
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def free(self):
+        # No code is emitted for the body: freeing is a no-op here.
+        code = pccm.code()
+        code.arg("ten", "tv::Tensor")
+        return code
+
+    @pccm.pybind.mark
+    @pccm.member_function(virtual=True)
+    def free_noexcept(self):
+        # No code is emitted for the body: freeing is a no-op here.
+        code = pccm.code()
+        code.arg("ten", "tv::Tensor")
+        return code
+
+
--- a/spconv/csrc/sparse/convops.py
+++ b/spconv/csrc/sparse/convops.py