still working on c++ only

21bb00ae · Yan Yan · 899008fa · 21bb00ae · 21bb00ae · 21bb00ae
Commit 21bb00ae authored Jul 27, 2022 by Yan Yan
20 changed files
--- a/example/tensorrt/README.md
+++ b/example/tensorrt/README.md
+<!--
+ Copyright 2022 Yan Yan
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+TODO
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -175,7 +175,7 @@ if disable_jit is not None and disable_jit == "1":
            std = "c++14" 
        else:
            std = "c++17"
-    if CUMM_CPU_ONLY_BUILD:
+    if not CUMM_CPU_ONLY_BUILD:
        gemmtuner = GemmTunerSimple(cu)
        gemmtuner.namespace = "csrc.sparse.convops.gemmops"
        convtuner = ConvTunerSimple(convcu)

--- a/spconv/build.py
+++ b/spconv/build.py
@@ -62,8 +62,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
        CompileInfo(),
        ExternalAllocator(),
        ExternalSpconvMatmul(),
-        SimpleExternalSpconvMatmul(),
+        SimpleExternalSpconvMatmul(), # for debug, won't be included in release
    ]
    pccm.builder.build_pybind(cus,
                              PACKAGE_ROOT / "core_cc",

--- a/spconv/constants.py
+++ b/spconv/constants.py
@@ -64,7 +64,7 @@ SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME)
 class AllocKeys:
-    Pair = "Pair"
+    PairBwd = "PairBwd"
    IndiceNumPerLoc = "IndiceNumPerLoc"
    PairMask = "PairMask"
    MaskArgSort = "MaskArgSort"
@@ -103,3 +103,5 @@ SPCONV_CPP_INDICE_PAIRS = True
 SPCONV_CPP_INDICE_PAIRS_IGEMM = True 
 SPCONV_CPP_GEMM = True
+SPCONV_FX_TRACE_MODE = os.getenv("SPCONV_FX_TRACE_MODE", "0") == "1"
\ No newline at end of file
--- a/spconv/core_cc/csrc/sparse/all/__init__.pyi
+++ b/spconv/core_cc/csrc/sparse/all/__init__.pyi
@@ -240,6 +240,28 @@ class SpconvOps:
        """
        ...
    @staticmethod
+    def avgpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, count_out: Tensor, stream: int = 0) -> None: 
+        """
+        Args:
+            out: 
+            inp: 
+            inds: 
+            count_out: 
+            stream: 
+        """
+        ...
+    @staticmethod
+    def avgpool_implicit_gemm_backward(dout: Tensor, dinp: Tensor, inds: Tensor, count_out: Tensor, stream: int = 0) -> None: 
+        """
+        Args:
+            dout: 
+            dinp: 
+            inds: 
+            count_out: 
+            stream: 
+        """
+        ...
+    @staticmethod
    def maxpool_forward_cpu(out: Tensor, inp: Tensor, out_inds: Tensor, in_inds: Tensor) -> None: 
        """
        Args:
@@ -280,15 +302,6 @@ class SpconvOps:
        """
        ...
    @staticmethod
-    def sort_1d_by_key(data: Tensor, indices: Tensor =  Tensor(), stream: int = 0) -> Tensor: 
-        """
-        Args:
-            data: 
-            indices: 
-            stream: 
-        """
-        ...
-    @staticmethod
    def sort_1d_by_key_allocator(data: Tensor, alloc_func, indices: Tensor =  Tensor(), stream: int = 0) -> Tensor: 
        """
        Args:
@@ -348,6 +361,24 @@ class SpconvOps:
        """
        ...
    @staticmethod
+    def maximum_value_int(data: Tensor, value: int, stream_int: int) -> None: 
+        """
+        Args:
+            data: 
+            value: 
+            stream_int: 
+        """
+        ...
+    @staticmethod
+    def sort_1d_by_key(data: Tensor, indices: Tensor =  Tensor(), stream: int = 0) -> Tensor: 
+        """
+        Args:
+            data: 
+            indices: 
+            stream: 
+        """
+        ...
+    @staticmethod
    def calc_point2voxel_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: 
        """
        Args:
@@ -407,6 +438,18 @@ class SpconvOps:
        """
        ...
    @staticmethod
+    def get_indice_gen_tensors_from_workspace(workspace, kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> Dict[str, Tensor]: 
+        """
+        Args:
+            workspace: 
+            kv: 
+            num_act_in: 
+            num_act_out_bound: 
+            subm: 
+            use_int64_hash_k: 
+        """
+        ...
+    @staticmethod
    def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]: 
        """
        Args:
@@ -428,7 +471,7 @@ class SpconvOps:
        """
        ...
    @staticmethod
-    def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> int: 
+    def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1, num_input_act_bound: int = -1) -> int: 
        """
        Args:
            allocator: 
@@ -445,5 +488,6 @@ class SpconvOps:
            transposed: 
            stream_int: 
            num_out_act_bound: 
+            num_input_act_bound: 
        """
        ...
--- a/spconv/core_cc/csrc/sparse/alloc.pyi
+++ b/spconv/core_cc/csrc/sparse/alloc.pyi
@@ -2,29 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
 from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
 class ExternalAllocator:
-    def zeros(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
+    def zeros(self, name: str, shape: List[int], dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor: 
        """
        Args:
            name: 
            shape: 
            dtype: 
            device: 
-            is_temp_memory: 
            stream: 
+            is_temp_memory: 
        """
        ...
-    def empty(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
+    def empty(self, name: str, shape: List[int], dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor: 
        """
        Args:
            name: 
            shape: 
            dtype: 
            device: 
-            is_temp_memory: 
            stream: 
+            is_temp_memory: 
        """
        ...
-    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
+    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor: 
        """
        Args:
            name: 
@@ -32,11 +32,11 @@ class ExternalAllocator:
            value: 
            dtype: 
            device: 
-            is_temp_memory: 
            stream: 
+            is_temp_memory: 
        """
        ...
-    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: 
+    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor: 
        """
        Args:
            name: 
@@ -44,8 +44,8 @@ class ExternalAllocator:
            value: 
            dtype: 
            device: 
-            is_temp_memory: 
            stream: 
+            is_temp_memory: 
        """
        ...
    def get_tensor_by_name(self, name: str) -> Tensor: 

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
--- a/spconv/csrc/sparse/alloc.py
+++ b/spconv/csrc/sparse/alloc.py
@@ -2,7 +2,8 @@ import pccm
 from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
 from spconv.constants import AllocKeys
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+from .indices import CudaCommonKernel
 class ExternalAllocatorGuard(pccm.Class):
    def __init__(self):
        super().__init__()
@@ -53,8 +54,8 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
        return code.ret("tv::Tensor")
@@ -66,8 +67,8 @@ class ExternalAllocator(pccm.Class):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
        return code.ret("tv::Tensor")
@@ -80,8 +81,8 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
        return code.ret("tv::Tensor")
@@ -94,8 +95,9 @@ class ExternalAllocator(pccm.Class):
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
        return code.ret("tv::Tensor")
    @pccm.pybind.mark(virtual=True)
@@ -129,7 +131,7 @@ class ExternalAllocator(pccm.Class):
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        // "" means temp memory
-        auto ten = zeros(name, shape, dtype, device, true, stream);
+        auto ten = zeros(name, shape, dtype, device, stream, true);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -145,7 +147,7 @@ class ExternalAllocator(pccm.Class):
        code.arg("name", "std::string", "\"\"")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = empty(name, shape, dtype, device, true, stream);
+        auto ten = empty(name, shape, dtype, device, stream, true);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -162,7 +164,7 @@ class ExternalAllocator(pccm.Class):
        code.arg("name", "std::string", "\"\"")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = full_int(name, shape, value, dtype, device, true, stream);
+        auto ten = full_int(name, shape, value, dtype, device, stream, true);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
            this->free(ten);
        }});
@@ -179,7 +181,7 @@ class ExternalAllocator(pccm.Class):
        code.arg("name", "std::string", "\"\"")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
-        auto ten = full_float(name, shape, value, dtype, device, true, stream);
+        auto ten = full_float(name, shape, value, dtype, device, stream, true);
        return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
            this->free(t);
        }});
@@ -222,8 +224,10 @@ class ThrustAllocator(pccm.Class):
        """)
        return code
 class StaticAllocator(ExternalAllocator):
-    """a simple allocator for tensorrt plugin.
+    """a static allocator for tensorrt plugin.
    """
    def __init__(self):
        super().__init__()
@@ -232,6 +236,7 @@ class StaticAllocator(ExternalAllocator):
        self.add_member("repr_", "std::string")
        self.add_member("thrust_tmp_tensor_", "tv::Tensor")
        self.grow = 1.5
+        self.cuda_common_kernel = CudaCommonKernel()
    @pccm.pybind.mark 
    @pccm.constructor
@@ -242,7 +247,22 @@ class StaticAllocator(ExternalAllocator):
        code.raw(f"""
        std::stringstream ss;
        for (auto& p : tensor_dict){{
-            tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
+            tv::sstream_print(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
+        }}
+        repr_ = ss.str();
+        """)
+        return code 
+    @pccm.pybind.mark 
+    @pccm.member_function
+    def set_new_tensor_dict(self):
+        code = pccm.code()
+        code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
+        code.raw(f"""
+        tensor_dict_ = tensor_dict;
+        std::stringstream ss;
+        for (auto& p : tensor_dict){{
+            tv::sstream_print(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
        }}
        repr_ = ss.str();
        """)
@@ -255,12 +275,21 @@ class StaticAllocator(ExternalAllocator):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
+        code.arg("is_temp_memory", "bool", "false")
        code.raw(f"""
        auto res = get_tensor_by_name(name);
        size_t total = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
-        TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype)) 
+        TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype)) / 8 
-            && res.device() == device, "alloc failed", shape, res.shape());
+            && res.device() == device, "alloc failed, tensor size too small", shape, res.shape());
-        return tv::from_blob(res.raw_data(), shape, dtype, device);
+        // if (is_temp_memory){{
+        // }}else{{
+        //     // size must exactly match
+        //     TV_ASSERT_RT_ERR(res.nbytes() == total * tv::bit_size(tv::DType(dtype)) / 8 
+        //         && res.device() == device, "alloc failed, named memory size must match", shape, res.shape());
+        // }}
+        return tv::from_blob(res.raw_data(), shape, tv::DType(dtype), device);
        """)
        return code.ret("tv::Tensor")
@@ -273,16 +302,22 @@ class StaticAllocator(ExternalAllocator):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
        code.raw(f"""
        auto tvctx = tv::Context();
+        """)
+        if not CUMM_CPU_ONLY_BUILD:
+            code.raw(f"""
            tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
-        auto blob = _get_raw_and_check(name, shape, dtype, device);
+            """)
+        code.raw(f"""
+        auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
        return blob.zero_(tvctx);
        """)
        return code.ret("tv::Tensor")
    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def empty(self):
@@ -291,8 +326,8 @@ class StaticAllocator(ExternalAllocator):
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
        code.raw(f"""
        if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
            // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
@@ -300,23 +335,28 @@ class StaticAllocator(ExternalAllocator):
            // so we can just use one tensor
            tv::Tensor res = thrust_tmp_tensor_;
            if (res.empty()){{
-                res = tv::empty(shape, dtype, device);
+                res = tv::empty(shape, tv::DType(dtype), device);
                thrust_tmp_tensor_ = res;
            }}
            if (shape[0] > thrust_tmp_tensor_.dim(0)){{
-                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
+                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, tv::DType(dtype), device);
                thrust_tmp_tensor_ = res;
            }}
            return res;
        }}else{{
-            auto blob = _get_raw_and_check(name, shape, dtype, device);
+            auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
            return blob;
        }}
        """)
        return code.ret("tv::Tensor")
+    # cpu only build can't use pccm.cuda
+    __CUDA_DECORATOR = pccm.member_function
+    if not CUMM_CPU_ONLY_BUILD:
+        __CUDA_DECORATOR = pccm.cuda.member_function
    @pccm.pybind.mark
-    @pccm.member_function(virtual=True)
+    @__CUDA_DECORATOR
    def full_int(self):
        code = pccm.code()
        code.arg("name", "std::string")
@@ -324,17 +364,36 @@ class StaticAllocator(ExternalAllocator):
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
        code.raw(f"""
        auto tvctx = tv::Context();
-        auto blob = _get_raw_and_check(name, shape, dtype, device);
+        auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
-        return blob.fill_(tvctx, value);
+        """)
+        if not CUMM_CPU_ONLY_BUILD:
+            code.add_param_class("cudakers", self.cuda_common_kernel)
+            code.raw(f"""
+            tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
+            using ints_t = std::tuple<int32_t, int16_t, int8_t, int64_t, uint32_t, uint64_t, uint16_t, uint8_t>;
+            tv::Dispatch<ints_t>()(blob.dtype(), [&](auto I){{
+                using T = TV_DECLTYPE(I);
+                tv::cuda::Launch lanucher_fill(blob.size(), reinterpret_cast<cudaStream_t>(stream));
+                lanucher_fill(cudakers::fill_kernel<T>, blob.data_ptr<T>(), value, blob.size());
+            }});
+            """)
+        else:
+            code.raw(f"""
+            blob.fill_(value);
+            """)
+        code.raw(f"""
+        return blob;
        """)
        return code.ret("tv::Tensor")
    @pccm.pybind.mark
-    @pccm.member_function(virtual=True)
+    @__CUDA_DECORATOR
    def full_float(self):
        code = pccm.code()
        code.arg("name", "std::string")
@@ -342,11 +401,29 @@ class StaticAllocator(ExternalAllocator):
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
-        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
+        code.arg("is_temp_memory", "bool", "false")
+        code.raw(f"""
+        auto tvctx = tv::Context();
+        auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
+        """)
+        if not CUMM_CPU_ONLY_BUILD:
+            code.add_param_class("cudakers", self.cuda_common_kernel)
            code.raw(f"""
-        auto blob = _get_raw_and_check(name, shape, dtype, device);
+            tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
-        return blob.fill_(tvctx, value);
+            using dtypes_t = std::tuple<float, double>;
+            tv::Dispatch<dtypes_t>()(blob.dtype(), [&](auto I){{
+                using T = TV_DECLTYPE(I);
+                tv::cuda::Launch lanucher_fill(blob.size(), reinterpret_cast<cudaStream_t>(stream));
+                lanucher_fill(cudakers::fill_kernel<T>, blob.data_ptr<T>(), value, blob.size());
+            }});
+            """)
+        else:
+            code.raw(f"""
+            blob.fill_(value);
+            """)
+        code.raw(f"""
+        return blob;
        """)
        return code.ret("tv::Tensor")
@@ -364,6 +441,7 @@ class StaticAllocator(ExternalAllocator):
    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free(self):
+        # nothing here because this is a static allocator
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code

--- a/spconv/csrc/sparse/convops.py
+++ b/spconv/csrc/sparse/convops.py
@@ -78,11 +78,9 @@ class ExternalSpconvMatmul(pccm.Class):
        return code
 class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
-    """a helper class to warp matmul operations
+    """implement gemm in cuda via cublasLt. (only support forward)
-    because we don't want to implement matmul
+    should be used with tensorrt plugin.
-    (link to cublas/mkl/pytorch) in python package.
    """
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView, ExternalAllocator)
@@ -311,7 +309,7 @@ class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
          TV_THROW_RT_ERR("unsupported");
        }}
        check_cublas_status(cublasLtMatmul(
-            handle, operationDesc, alpha_storage, a.raw_data(), Adesc, b.raw_data(),
+            handle, operationDesc, alpha_storage, a.const_raw_data(), Adesc, b.const_raw_data(),
            Bdesc, beta_storage, c.raw_data(), Cdesc, c.raw_data(), Cdesc,
            &heuristicResult.algo, nullptr, 0, stream));
        if (preference)
@@ -1417,11 +1415,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
                is_KC_not_CK, kv_center, out_channel);
        }}else{{
            out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)}, 
-                {{num_activate_out, out_channel}}, features.dtype(), features.device());
+                {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
        }}
        if (kv == 1 && subm){{
            return;
        }}
        auto indice_pair_num_cpu = indice_pair_num.cpu();
        auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
        int maxnhot = 0;
@@ -1618,7 +1617,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
        int kv_center = kv / 2;
        tv::Tensor din;
        auto dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)}, 
-                prev_filter_shape_vec, features.dtype(), features.device());
+                prev_filter_shape_vec, features.dtype(), features.device(), stream_int);
        dfilters = dfilters.view(filters.shape());
        if (subm){{
            din = ext_mm.indice_conv_bwd_init_gemm({pccm.literal(AllocKeys.Features)}, 
@@ -1628,7 +1627,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
                is_KC_not_CK, kv_center);
        }}else{{
            din = allocator.zeros({pccm.literal(AllocKeys.DIn)}, 
-                    features.shape_vector(), features.dtype(), features.device());
+                    features.shape_vector(), features.dtype(), features.device(), stream_int);
        }}
        if (kv == 1 && subm){{
            return;
@@ -1922,10 +1921,10 @@ class ConvGemmOps(pccm.ParameterizedClass):
        tv::Tensor out_features;
        if (is_subm){{
            out_features = allocator.empty({pccm.literal(AllocKeys.OutFeatures)}, 
-                {{num_activate_out, out_channel}}, features.dtype(), features.device());
+                {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
        }}else{{
            out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)}, 
-                {{num_activate_out, out_channel}}, features.dtype(), features.device());
+                {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
        }}
        auto arch = get_compute_capability();
        constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward);
@@ -1966,7 +1965,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
        if (is_train){{
            mask_output_fwd = allocator.empty({pccm.literal(AllocKeys.MaskOutputFwd)}, 
                {{num_split, tv::div_up(num_activate_out, mask_width)}}, 
-                tv::uint32, features.device());
+                tv::uint32, features.device(), stream_int);
            for (int i = 0; i < num_split; ++i){{
                mask_output_fwd_splits.push_back(mask_output_fwd[i]);
            }}
@@ -2042,13 +2041,13 @@ class ConvGemmOps(pccm.ParameterizedClass):
        tv::Tensor din;
        if (is_subm){{
            din = allocator.empty({pccm.literal(AllocKeys.DIn)}, 
-                features.shape_vector(), features.dtype(), features.device());
+                features.shape_vector(), features.dtype(), features.device(), stream_int);
        }}else{{
            din = allocator.zeros({pccm.literal(AllocKeys.DIn)}, 
-                features.shape_vector(), features.dtype(), features.device());
+                features.shape_vector(), features.dtype(), features.device(), stream_int);
        }}
        tv::Tensor dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)}, 
-            filters_shape_vec, filters.dtype(), filters.device());
+            filters_shape_vec, filters.dtype(), filters.device(), stream_int);
        dfilters = dfilters.view(out_channel, -1, in_channel);
        constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward);

--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
--- a/spconv/csrc/sparse/maxpool.py
+++ b/spconv/csrc/sparse/maxpool.py
@@ -180,6 +180,85 @@ class IndiceMaxPool(pccm.Class):
        """)
        return code
+    @pccm.cuda.cuda_global_function
+    def forward_avgpool_implicit_gemm_kernel(self):
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("out_features", f"T*")
+        code.arg("in_features", f"const T*")
+        code.arg("indices", "const int*")
+        code.arg("count_out", "int*")
+        code.arg("num_features", "int")
+        code.arg("RS", "int")
+        code.arg("num_indices", "int")
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(num_indices)) {{
+            auto out_ptr = out_features + i * num_features;
+            auto indices_ptr = indices + i;
+            int in_idx = 0;
+            int count = 0;
+            for (int k = 0; k < RS; ++k){{
+                in_idx = indices_ptr[0];
+                count += int(in_idx != -1);
+                indices_ptr += num_indices;
+            }}
+            if (count_out != nullptr){{
+                count_out[i] = count;
+            }}
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                indices_ptr = indices + i;
+                int in_idx;
+                T in, in_temp;
+                in = T(0);
+                for (int k = 0; k < RS; ++k){{
+                    in_idx = indices_ptr[0];
+                    bool valid = in_idx != -1;
+                    in_temp = valid ? in_features[in_idx * num_features + j] : T(0);
+                    in += in_temp;
+                    indices_ptr += num_indices;
+                }}
+                out_ptr[j] = count > 0 ? in / T(count) : T(0);
+            }}
+        }}
+        """)
+        return code
+    @pccm.cuda.cuda_global_function
+    def backward_avgpool_implicit_gemm_kernel(self):  # emits the avg-pool backward kernel: din[i, j] = sum over valid k of dout[out_idx, j] / count_out[out_idx]
+        code = pccm.FunctionCode()
+        code.targ("T")
+        code.arg("dout_features", f"const T*")
+        code.arg("din_features", f"T*")
+        code.arg("indices_bwd", "const int*")
+        code.arg("count_out", "const int*")
+        code.arg("num_features", "int")
+        code.arg("RS", "int")
+        code.arg("num_indices", "int")
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(num_indices)) {{
+            auto din_ptr = din_features + i * num_features;
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                auto indices_ptr = indices_bwd + i;
+                int out_idx = 0;
+                T sum_val = T(0);
+                for (int k = 0; k < RS; ++k){{
+                    out_idx = indices_ptr[0];
+                    bool valid = out_idx != -1;
+                    T dout = valid ? dout_features[out_idx * num_features + j] : T(0);
+                    int count = valid ? count_out[out_idx] : 1; // 1 keeps the division below defined for skipped (-1) entries
+                    sum_val += dout / T(count); // forward writes sum / count, so each input receives dout / count
+                    indices_ptr += num_indices;
+                }}
+                din_ptr[j] = sum_val;
+            }}
+        }}
+        """)
+        return code
    @pccm.cuda.static_function
    def forward(self):
        code = pccm.FunctionCode()
@@ -348,6 +427,92 @@ class IndiceMaxPool(pccm.Class):
        """)
        return code
+    @pccm.cuda.static_function
+    def forward_avgpool_implicit_gemm(self):
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("in", "tv::Tensor")
+        code.arg("inds", "tv::Tensor")
+        code.arg("count_out", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        auto nhot = out.dim(0);
+        tv::check_shape(inds, {{-1, nhot}});
+        tv::check_shape(in, {{-1, out.dim(1)}});
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            constexpr int MaxThreads = 512;
+            tv::cuda::Launch launcher(1);
+            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
+                // if out.dim(1) >= a value in the list above, run this function.
+                // once a value is found, the other values won't be executed.
+                int NumFeatures = TV_DECLTYPE(V)::value;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }});
+            if (!found){{
+                int NumFeatures = 16;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }}
+            launcher(forward_avgpool_implicit_gemm_kernel<T>, out.data_ptr<T>(), in.data_ptr<const T>(),
+                inds.data_ptr<const int>(), count_out.data_ptr<int>(), out.dim(1), inds.dim(0), inds.dim(1));
+        }});
+        """)
+        return code
+    @pccm.cuda.static_function
+    def backward_avgpool_implicit_gemm(self):
+        code = pccm.FunctionCode()
+        code.arg("dout", "tv::Tensor")
+        code.arg("din", "tv::Tensor")
+        code.arg("inds", "tv::Tensor")
+        code.arg("count_out", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        auto nhot = din.dim(0);
+        TV_ASSERT_RT_ERR(!count_out.empty(), "count out must not empty")
+        tv::check_shape(inds, {{-1, nhot}});
+        tv::check_shape(din, {{-1, dout.dim(1)}});
+        int num_act_out = dout.dim(1);
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(dout.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            constexpr int MaxThreads = 512;
+            tv::cuda::Launch launcher(1);
+            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(dout.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
+                // if dout.dim(1) >= a value in the list above, run this function.
+                // once a value is found, the other values won't be executed.
+                int NumFeatures = TV_DECLTYPE(V)::value;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(dout.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }});
+            if (!found){{
+                int NumFeatures = 16;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(dout.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }}
+            launcher(backward_avgpool_implicit_gemm_kernel<T>, 
+                dout.data_ptr<const T>(), din.data_ptr<T>(),
+                inds.data_ptr<const int>(), count_out.data_ptr<const int>(),
+                dout.dim(1), inds.dim(0), inds.dim(1));
+        }});
+        """)
+        return code
 class IndiceMaxPoolCPU(pccm.Class):
    def __init__(self):

--- a/spconv/csrc/sparse/pointops.py
+++ b/spconv/csrc/sparse/pointops.py
@@ -297,7 +297,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
        self.add_dependency(TensorView)
        self.p2v_c = Point2VoxelCommon(dtype, ndim, zyx)
        self.add_param_class("p2v_c", self.p2v_c, "Point2VoxelCommon")
-        layout = TensorGeneric(ndim, True)
+        layout = TensorGeneric(ndim, False)
        self.add_param_class("layout_ns", layout, "Layout")
        self.dtype = dtype
        self.ndim = ndim
@@ -489,7 +489,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
        super().__init__()
        self.add_dependency(TensorView)
-        layout = TensorGeneric(ndim, True)
+        layout = TensorGeneric(ndim, False)
        self.add_param_class("layout_ns", layout, "Layout")
        self.dtype = dtype
        self.ndim = ndim

--- a/spconv/gencode/__init__.py
+++ b/spconv/gencode/__init__.py
+# Copyright 2022 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/gencode/__main__.py
+++ b/spconv/gencode/__main__.py
@@ -10,33 +10,41 @@ from spconv.core import (IMPLGEMM_SIMT_PARAMS, IMPLGEMM_TURING_PARAMS,
                         SHUFFLE_TURING_PARAMS, SHUFFLE_VOLTA_PARAMS)
 from spconv.csrc.hash.core import HashTable
 from spconv.csrc.sparse.all import SpconvOps
-from spconv.csrc.sparse.alloc import ExternalAllocator
+from spconv.csrc.sparse.alloc import ExternalAllocator, StaticAllocator
 from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
                                        ExternalSpconvMatmul, GemmTunerSimple,
                                        SimpleExternalSpconvMatmul)
 from spconv.csrc.utils import BoxOps
+from cumm.gemm.algospec.core import (GemmAlgo, ShuffleStrideType)
+from cumm.conv.bases import ConvLayout, ConvLayoutType, ConvOpType
 def main(include: str,
         src: str,
         libname: str = "spconv",
-         prefix: str = "spconvlib"):
+         prefix: str = "spconvlib",
+         inference_only: bool = False):
    all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
    all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
+    if inference_only:
+        all_shuffle = list(filter(lambda x: x.shuffle_stride != ShuffleStrideType.ShuffleAB, all_shuffle))
    cu = GemmMainUnitTest(all_shuffle)
    cu.namespace = "cumm.gemm.main"
    all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
               IMPLGEMM_TURING_PARAMS)
    # all_imp = IMPLGEMM_SIMT_PARAMS
    all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
+    if inference_only:
+        all_imp = list(filter(lambda x: x.op_type == ConvOpType.kForward, all_imp))
    convcu = ConvMainUnitTest(all_imp)
    convcu.namespace = "cumm.conv.main"
    gemmtuner = GemmTunerSimple(cu)
-    gemmtuner.namespace = "csrc.sparse.convops.gemmops"
+    gemmtuner.namespace = "spconv.csrc.sparse.convops.gemmops"
    convtuner = ConvTunerSimple(convcu)
-    convtuner.namespace = "csrc.sparse.convops.convops"
+    convtuner.namespace = "spconv.csrc.sparse.convops.convops"
    convops = ConvGemmOps(gemmtuner, convtuner)
-    convops.namespace = "csrc.sparse.convops.spops"
+    convops.namespace = "spconv.csrc.sparse.convops.spops"
    cus = [
        cu,
@@ -51,6 +59,7 @@ def main(include: str,
        ExternalAllocator(),
        ExternalSpconvMatmul(),
        SimpleExternalSpconvMatmul(),
+        StaticAllocator(),
    ]
    gen_cmake(libname, cus, include, src, namespace_prefix=prefix)

--- a/spconv/pytorch/__init__.py
+++ b/spconv/pytorch/__init__.py
@@ -17,7 +17,9 @@ from spconv.pytorch.modules import (SparseModule, SparseSequential,
                                    assign_name_for_sparse_modules)
 from spconv.pytorch.ops import ConvAlgo
 from spconv.pytorch.pool import (SparseMaxPool1d, SparseMaxPool2d,
-                                 SparseMaxPool3d, SparseMaxPool4d)
+                                 SparseMaxPool3d, SparseMaxPool4d,
+                                 SparseAvgPool1d, SparseAvgPool2d,
+                                 SparseAvgPool3d)
 from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable

--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -38,6 +38,9 @@ from torch.nn.init import calculate_gain
 FILTER_HWIO = False
+_MAX_NUM_VOXELS_DURING_TRAINING = "max_num_voxels_during_training"
 class SparseConvolution(SparseModule):
    __constants__ = [
        'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
@@ -61,6 +64,7 @@ class SparseConvolution(SparseModule):
                 indice_key: Optional[str] = None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
        super(SparseConvolution, self).__init__(name=name)
        assert groups == 1, "don't support groups for now"
@@ -89,6 +93,12 @@ class SparseConvolution(SparseModule):
        self.groups = groups
        self.subm = subm
        self.indice_key = indice_key
+        if record_voxel_count and not self.subm and not self.inverse:
+            # we record the maximum voxel num in both inference and training
+            # when the record_voxel_count flag is set.
+            self.register_buffer(_MAX_NUM_VOXELS_DURING_TRAINING,
+                                 torch.zeros(1, dtype=torch.int32))
+        self.record_voxel_count = record_voxel_count
        if algo is None:
            if kv <= 32 and not CPU_ONLY_BUILD:
                if kv < 8:
@@ -122,37 +132,46 @@ class SparseConvolution(SparseModule):
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
+        if hasattr(self, "_register_load_state_dict_pre_hook"):
-        self._register_load_state_dict_pre_hook(self._load_weight_different_layout)
+            self._register_load_state_dict_pre_hook(
+                self._load_weight_different_layout)
-    def _load_weight_different_layout(
-            self, state_dict, prefix, local_metadata, strict,
+    def _load_weight_different_layout(self, state_dict, prefix, local_metadata,
-            missing_keys, unexpected_keys, error_msgs):
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs):
+        """Adjust an incoming ``state_dict`` in place before it is loaded.
+
+        Registered through ``_register_load_state_dict_pre_hook``.
+
+        * When voxel-count recording applies to this layer and the
+          ``state_dict`` has no entry for the voxel-count buffer, a
+          zero-filled int32 entry is inserted under ``prefix``.
+        * When ``SAVED_WEIGHT_LAYOUT`` is set, the ``prefix + "weight"``
+          entry is permuted depending on ``SAVED_WEIGHT_LAYOUT``,
+          ``ALL_WEIGHT_IS_KRSC`` and ``self.algo``.
+        """
+        # The entry is written under the prefixed name, so the lookup must
+        # use the prefixed name as well; testing the bare name never matches
+        # when prefix is non-empty and would replace a saved count by zeros.
+        voxel_key = prefix + _MAX_NUM_VOXELS_DURING_TRAINING
+        if self.record_voxel_count and not self.subm and not self.inverse and voxel_key not in state_dict:
+            state_dict[voxel_key] = torch.zeros(1, dtype=torch.int32)
        if not SAVED_WEIGHT_LAYOUT:
            return
        key = prefix + "weight"
        assert key in state_dict
        ndim = self.ndim
        if SAVED_WEIGHT_LAYOUT == "RSKC":
-            state_dict[key] = state_dict[key].permute(ndim, *range(ndim), ndim + 1).contiguous()
+            state_dict[key] = state_dict[key].permute(ndim, *range(ndim),
+                                                      ndim + 1).contiguous()
        elif SAVED_WEIGHT_LAYOUT == "RSCK":
-            state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim), ndim).contiguous()
+            state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim),
+                                                      ndim).contiguous()
        if ALL_WEIGHT_IS_KRSC or self.algo != ConvAlgo.Native:
            # in spconv 2.2, we only support KRSC layout.
            if SAVED_WEIGHT_LAYOUT == "RSKC":
-                state_dict[key] = state_dict[key].permute(ndim, *range(ndim), ndim + 1).contiguous()
+                state_dict[key] = state_dict[key].permute(
+                    ndim, *range(ndim), ndim + 1).contiguous()
            elif SAVED_WEIGHT_LAYOUT == "RSCK":
-                state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim), ndim).contiguous()
+                state_dict[key] = state_dict[key].permute(
+                    ndim + 1, *range(ndim), ndim).contiguous()
        else:
            if self.algo == ConvAlgo.Native:
                # to RSCK
                if SAVED_WEIGHT_LAYOUT == "RSKC":
-                    state_dict[key] = state_dict[key].permute(*range(ndim), ndim + 1, ndim).contiguous()
+                    state_dict[key] = state_dict[key].permute(
+                        *range(ndim), ndim + 1, ndim).contiguous()
                elif SAVED_WEIGHT_LAYOUT == "KRSC":
-                    state_dict[key] = state_dict[key].permute(*range(1, ndim + 1), 0, ndim + 1).contiguous()
+                    state_dict[key] = state_dict[key].permute(
+                        *range(1, ndim + 1), 0, ndim + 1).contiguous()
    def extra_repr(self):
        s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
@@ -218,6 +237,9 @@ class SparseConvolution(SparseModule):
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)
+    def is_inverseable(self):
+        """Return True when ``indice_key`` is set and the layer is not ``subm``."""
+        if self.subm:
+            return False
+        return self.indice_key is not None
    def forward(self, input: SparseConvTensor):
        assert isinstance(input, SparseConvTensor)
        assert input.features.shape[
@@ -410,7 +432,6 @@ class SparseConvolution(SparseModule):
                        self._check_subm_reuse_valid(input, spatial_shape,
                                                     datas)
                    else:
                        with input._timer.namespace("gen_pairs"):
                            # we need to gen bwd indices for regular conv
                            # because it may be inversed.
@@ -491,6 +512,14 @@ class SparseConvolution(SparseModule):
                features.shape[0])
            out_tensor.benchmark_record[self.name]["num_out_points"].append(
                out_features.shape[0])
+        if not self.subm and not self.inverse and self.record_voxel_count:
+            if hasattr(self,
+                        _MAX_NUM_VOXELS_DURING_TRAINING):
+                ops.maximum_value_int_(
+                    getattr(
+                        self,
+                        _MAX_NUM_VOXELS_DURING_TRAINING),
+                    outids.shape[0])
        out_tensor = out_tensor.replace_feature(out_features)
        out_tensor.indices = outids
        out_tensor.indice_dict = indice_dict
@@ -534,8 +563,10 @@ class SparseConv1d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConv1d, self).__init__(1,
+        super(SparseConv1d,
+              self).__init__(1,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -547,6 +578,7 @@ class SparseConv1d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)
@@ -563,8 +595,10 @@ class SparseConv2d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConv2d, self).__init__(2,
+        super(SparseConv2d,
+              self).__init__(2,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -576,6 +610,7 @@ class SparseConv2d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)
@@ -592,8 +627,10 @@ class SparseConv3d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConv3d, self).__init__(3,
+        super(SparseConv3d,
+              self).__init__(3,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -605,6 +642,7 @@ class SparseConv3d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)
@@ -621,8 +659,10 @@ class SparseConv4d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConv4d, self).__init__(4,
+        super(SparseConv4d,
+              self).__init__(4,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -634,6 +674,7 @@ class SparseConv4d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)
@@ -650,8 +691,10 @@ class SparseConvTranspose1d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConvTranspose1d, self).__init__(1,
+        super(SparseConvTranspose1d,
+              self).__init__(1,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -664,6 +707,7 @@ class SparseConvTranspose1d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)
@@ -680,8 +724,10 @@ class SparseConvTranspose2d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConvTranspose2d, self).__init__(2,
+        super(SparseConvTranspose2d,
+              self).__init__(2,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -694,6 +740,7 @@ class SparseConvTranspose2d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)
@@ -710,8 +757,10 @@ class SparseConvTranspose3d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConvTranspose3d, self).__init__(3,
+        super(SparseConvTranspose3d,
+              self).__init__(3,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -724,6 +773,7 @@ class SparseConvTranspose3d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)
@@ -740,8 +790,10 @@ class SparseConvTranspose4d(SparseConvolution):
                 indice_key=None,
                 algo: Optional[ConvAlgo] = None,
                 fp32_accum: Optional[bool] = None,
+                 record_voxel_count: bool = False,
                 name=None):
-        super(SparseConvTranspose4d, self).__init__(4,
+        super(SparseConvTranspose4d,
+              self).__init__(4,
                             in_channels,
                             out_channels,
                             kernel_size,
@@ -754,6 +806,7 @@ class SparseConvTranspose4d(SparseConvolution):
                             indice_key=indice_key,
                             algo=algo,
                             fp32_accum=fp32_accum,
+                             record_voxel_count=record_voxel_count,
                             name=name)

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Optional, Tuple, Union, Dict
+from typing import Any, List, Optional, Tuple, Union, Dict
 import numpy as np
 import torch
 from spconv.core import ConvAlgo
 from spconv.pytorch.constants import PYTORCH_VERSION
 from spconv.tools import CUDAKernelTimer
+from spconv.constants import SPCONV_FX_TRACE_MODE
 if PYTORCH_VERSION >= [1, 8, 0]:
    try:
@@ -59,7 +60,8 @@ class ThrustSortAllocator:
 class IndiceData(object):
    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
                 spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo,
-                 ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]):
+                 ksize: List[int], stride: List[int], dilation: List[int], padding: List[int],
+                 voxel_num: Optional[Any] = None):
        self.out_indices = out_indices
        self.indices = indices
        self.indice_pairs = indice_pairs
@@ -72,6 +74,8 @@ class IndiceData(object):
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
+        # voxel_num is only used in tensorrt conversion.
+        self.voxel_num = voxel_num
 class ImplicitGemmIndiceData(object):
@@ -83,7 +87,9 @@ class ImplicitGemmIndiceData(object):
                 mask_argsort_bwd_splits: List[torch.Tensor],
                 masks: List[np.ndarray], spatial_shape, 
                 out_spatial_shape, is_subm: bool, algo: ConvAlgo,
-                 ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]):
+                 ksize: List[int], stride: List[int], dilation: List[int], padding: List[int],
+                 in_voxel_num: Optional[Any] = None,
+                 out_voxel_num: Optional[Any] = None):
        self.out_indices = out_indices
        self.indices = indices
        self.pair_fwd = pair_fwd
@@ -101,6 +107,9 @@ class ImplicitGemmIndiceData(object):
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
+        # in/out voxel_num is only used in tensorrt conversion.
+        self.in_voxel_num = in_voxel_num
+        self.out_voxel_num = out_voxel_num
 def scatter_nd(indices, updates, shape):
@@ -147,6 +156,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
            force_algo: force conv/pool layers use this algo, should only used for debug.
        """
        ndim = indices.shape[1] - 1
+        if not SPCONV_FX_TRACE_MODE:
            assert features.ndim == 2
            assert indices.ndim == 2
            assert len(spatial_shape) == ndim, "spatial shape must equal to ndim"

--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
@@ -103,7 +103,7 @@ class TorchAllocator(ExternalAllocator):
        self.allocated: Dict[Union[str, int], torch.Tensor] = {}
    def zeros(self, name: str, shape: List[int], dtype: int,
-              device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
+              device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
        # TODO free memory by name if its already free by pointer.
        # provide a name if you want to access it after c++ function exit.
        torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
@@ -126,7 +126,7 @@ class TorchAllocator(ExternalAllocator):
        return ten_tv
    def empty(self, name: str, shape: List[int], dtype: int,
-              device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
+              device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
        torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
        dtype_bkp = dtype
        if dtype in _TORCH_UINT_WORKAROUNDS:
@@ -147,7 +147,7 @@ class TorchAllocator(ExternalAllocator):
        return ten_tv
    def full_int(self, name: str, shape: List[int], value: int, dtype: int,
-                 device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
+                 device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
        if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
            raise NotImplementedError("you can't use full for unsigned dtypes")
        torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
@@ -171,7 +171,7 @@ class TorchAllocator(ExternalAllocator):
        return ten_tv
    def full_float(self, name: str, shape: List[int], value: float, dtype: int,
-                   device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
+                   device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
        if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
            raise NotImplementedError("you can't use full for unsigned dtypes")
        torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS

--- a/spconv/pytorch/functional.py
+++ b/spconv/pytorch/functional.py
@@ -361,6 +361,25 @@ class SparseMaxPoolImplicitGemmFunction(Function):
            features, out, grad_output, indice_pairs_bwd)
        return input_bp, None, None, None
+class SparseAvgPoolImplicitGemmFunction(Function):
+    """Autograd wrapper around the implicit-gemm average-pooling ops.
+
+    ``forward`` calls ``ops.indice_avgpool_implicit_gemm`` and ``backward``
+    calls ``ops.indice_avgpool_implicit_gemm_backward``.
+    """
+    @staticmethod
+    @_TORCH_CUSTOM_FWD
+    def forward(ctx, features: torch.Tensor, indice_pairs_fwd: torch.Tensor,
+                indice_pairs_bwd: torch.Tensor, num_activate_out: int, calc_count):
+        out, count = ops.indice_avgpool_implicit_gemm(features, indice_pairs_fwd,
+                                               num_activate_out, calc_count)
+        # backward only reads the backward pairs and the count, so
+        # ``features`` and ``out`` are not saved: saving them would only keep
+        # them referenced until backward runs.
+        ctx.save_for_backward(indice_pairs_bwd, count)
+        return out
+    @staticmethod
+    @once_differentiable
+    @_TORCH_CUSTOM_BWD
+    def backward(ctx, grad_output):
+        indice_pairs_bwd, count = ctx.saved_tensors
+        input_bp = ops.indice_avgpool_implicit_gemm_backward(
+            grad_output, indice_pairs_bwd, count)
+        # one return slot per forward() input; only ``features`` gets a gradient.
+        return input_bp, None, None, None, None
 indice_conv = SparseConvFunction.apply
 implicit_gemm = SparseImplicitGemmFunction.apply
@@ -368,6 +387,7 @@ indice_inverse_conv = SparseInverseConvFunction.apply
 indice_subm_conv = SubMConvFunction.apply
 indice_maxpool = SparseMaxPoolFunction.apply
 indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply
+indice_avgpool_implicit_gemm = SparseAvgPoolImplicitGemmFunction.apply
 def _indice_to_scalar(indices: torch.Tensor, shape: List[int]):

--- a/spconv/pytorch/modules.py
+++ b/spconv/pytorch/modules.py
@@ -132,12 +132,11 @@ class SparseSequential(SparseModule):
                if isinstance(input, list):
                    input = module(input)
                else:
-                    assert isinstance(input, spconv.SparseConvTensor)
+                    # assert isinstance(input, spconv.SparseConvTensor)
                    # self._sparity_dict[k] = input.sparity
                    input = module(input)
            else:
                if isinstance(input, spconv.SparseConvTensor):
-                    print(input.features.shape)
                    if input.indices.shape[0] != 0:
                        input = input.replace_feature(module(input.features))
                else: