add fused bias/act

d0bfb3a3 · yan.yan · 2b195e43 · d0bfb3a3 · d0bfb3a3 · d0bfb3a3
Commit d0bfb3a3 authored Sep 06, 2022 by yan.yan
15 changed files
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@ REQUIRES_PYTHON = '>=3.6'
 VERSION = None

 # What packages are required for this module to be executed?
-REQUIRED = ["pccm>=0.2.21", "pybind11>=2.6.0", "fire", "numpy", *deps]
+REQUIRED = ["pccm>=0.3.5", "pybind11>=2.6.0", "fire", "numpy", *deps]

 # What packages are optional?
 EXTRAS = {
@@ -162,6 +162,7 @@ if disable_jit is not None and disable_jit == "1":
    from spconv.csrc.sparse.alloc import ExternalAllocator
    from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
    from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
+    from spconv.csrc.sparse.inference import InferenceOps

    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
    convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
@@ -192,7 +193,7 @@ if disable_jit is not None and disable_jit == "1":
    cus = [gemmtuner, convtuner,
        convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(), 
        ExternalAllocator(),
-        ExternalSpconvMatmul()]
+        ExternalSpconvMatmul(), InferenceOps()]
    if not CUMM_CPU_ONLY_BUILD:
        cus.extend([cu, convcu])
    ext_modules: List[Extension] = [

--- a/spconv/algo.py
+++ b/spconv/algo.py
@@ -606,7 +606,11 @@ class SimpleGemm:
                              gather_data: tv.Tensor = tv.Tensor(),
                              workspace: tv.Tensor = tv.Tensor(),
                              timer: CUDAKernelTimer = CUDAKernelTimer(False),
-                              force_nvrtc: bool = False):
+                              force_nvrtc: bool = False,
+                              bias: Optional[tv.Tensor] = None,
+                              act_alpha: float = 0.0,
+                              act_beta: float = 0.0,
+                              act_type: tv.gemm.Activation = tv.gemm.Activation.None_):
        m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape, trans_a,
                                               trans_b, trans_c,
                                               shuffle_type.value,
@@ -630,6 +634,8 @@ class SimpleGemm:
        params.a = a
        params.b = b
        params.c = c
+        if bias is not None:
+            params.d = bias
        params.a_inds = a_inds
        params.b_inds = b_inds
        params.c_inds = c_inds
@@ -638,6 +644,9 @@ class SimpleGemm:
        params.stream = stream
        params.alpha = alpha
        params.beta = beta
+        params.act_alpha = act_alpha
+        params.act_beta = act_beta
+        params.act_type = act_type
        params.workspace = workspace
        # gather = 0
        # if profile_res.external_gather and not gather_data.empty():
@@ -973,7 +982,11 @@ class SimpleConv:
                              workspace: tv.Tensor = tv.Tensor(),
                              verbose: bool = False,
                              timer: CUDAKernelTimer = CUDAKernelTimer(False),
-                              force_nvrtc: bool = False):
+                              force_nvrtc: bool = False,
+                              bias: Optional[tv.Tensor] = None,
+                              act_alpha: float = 0.0,
+                              act_beta: float = 0.0,
+                              act_type: tv.gemm.Activation = tv.gemm.Activation.None_):
        channel_k = output.dim(1)
        channel_c = inp.dim(1)
        # GemmMainUnitTest.stream_synchronize(stream)
@@ -989,7 +1002,7 @@ class SimpleConv:
        params = ConvParams(NDIM_DONT_CARE, ConvOpTypeCpp(op_type_value))
        is_not_static = str(
                algo_desp) not in self.prebuilt_desp_names
-        if algo_desp.is_nvrtc and (is_not_static or force_nvrtc):
+        if force_nvrtc or (algo_desp.is_nvrtc and is_not_static):
            params.nvrtc_params = self._cached_get_nvrtc_params(
                algo_desp, profile_res.arch)
        params.conv_algo_desp = profile_res.algo_desp
@@ -1001,6 +1014,9 @@ class SimpleConv:
        params.split_k_slices = split_k_slices
        params.alpha = alpha
        params.beta = beta
+        params.act_alpha = act_alpha
+        params.act_beta = act_beta
+        params.act_type = act_type
        params.stream = stream
        params.mask_argsort = mask_argsort
        params.indices = indices
@@ -1011,6 +1027,8 @@ class SimpleConv:
        params.mask_filter = mask_filter
        params.mask_output = mask_output
        params.reverse_mask = reverse_mask
+        if bias is not None:
+            params.bias = bias
        if timer.enable:
            assert timer._timer is not None
            params.timer = timer._timer

--- a/spconv/build.py
+++ b/spconv/build.py
@@ -36,6 +36,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
    from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
    from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
    from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul
+    from spconv.csrc.sparse.inference import InferenceOps

    all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
    all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
@@ -63,6 +64,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
        ExternalAllocator(),
        ExternalSpconvMatmul(),
        SimpleExternalSpconvMatmul(), # for debug, won't be included in release
+        InferenceOps(),
    ]
    pccm.builder.build_pybind(cus,
                              PACKAGE_ROOT / "core_cc",

--- a/spconv/constants.py
+++ b/spconv/constants.py
@@ -100,7 +100,9 @@ class AllocKeys:

 SPCONV_DEBUG_WEIGHT = False

-SPCONV_CPP_INDICE_PAIRS = False 
+SPCONV_CPP_INDICE_PAIRS = True 
+
+SPCONV_USE_DIRECT_TABLE = True 

 # currently use cpp pair gen is slightly slower than python, I don't know why.
 SPCONV_CPP_INDICE_PAIRS_IGEMM = os.getenv("SPCONV_CPP_INDICE_PAIRS_IGEMM", "0") == "1" 

--- a/spconv/core_cc/csrc/sparse/convops/convops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/convops.pyi
@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
 from cumm.tensorview.gemm import NVRTCParams
 from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
 from cumm.tensorview import CUDAKernelTimer
+from cumm.tensorview.gemm import Activation
 class ConvTunerSimple:
    def __init__(self, desps: List[ConvAlgoDesp]) -> None: 
        """
@@ -88,7 +89,7 @@ class ConvTunerSimple:
            mask_width: 
        """
        ...
-    def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor =  Tensor(), verbose: bool = False, timer: CUDAKernelTimer =  CUDAKernelTimer(false), force_nvrtc: bool = False) -> None: 
+    def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor =  Tensor(), verbose: bool = False, timer: CUDAKernelTimer =  CUDAKernelTimer(false), force_nvrtc: bool = False, bias: Tensor =  Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation =  Activation.None_) -> None: 
        """
        Args:
            profile_res: 
@@ -110,6 +111,10 @@ class ConvTunerSimple:
            verbose: 
            timer: 
            force_nvrtc: 
+            bias: 
+            act_alpha: 
+            act_beta: 
+            act_type: 
        """
        ...
    def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int: 

--- a/spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/gemmops.pyi
@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
 from cumm.tensorview.gemm import NVRTCParams
 from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
 from cumm.tensorview import CUDAKernelTimer
+from cumm.tensorview.gemm import Activation
 class GemmTunerSimple:
    def __init__(self, desps: List[GemmAlgoDesp]) -> None: 
        """
@@ -81,7 +82,7 @@ class GemmTunerSimple:
            hint: 
        """
        ...
-    def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor =  Tensor(), timer: CUDAKernelTimer =  CUDAKernelTimer(False), force_nvrtc: bool = False) -> None: 
+    def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor =  Tensor(), timer: CUDAKernelTimer =  CUDAKernelTimer(False), force_nvrtc: bool = False, bias: Tensor =  Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation =  Activation.None_) -> None: 
        """
        Args:
            profile_res: 
@@ -103,5 +104,9 @@ class GemmTunerSimple:
            workspace: 
            timer: 
            force_nvrtc: 
+            bias: 
+            act_alpha: 
+            act_beta: 
+            act_type: 
        """
        ...
--- a/spconv/core_cc/csrc/sparse/convops/spops.pyi
+++ b/spconv/core_cc/csrc/sparse/convops/spops.pyi
 from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from pccm.stubs import EnumValue, EnumClassValue
 from cumm.tensorview import Tensor
+from cumm.tensorview.gemm import Activation
 from cumm.tensorview import CUDAKernelTimer
 class ConvGemmOps:
    @staticmethod
@@ -11,7 +12,7 @@ class ConvGemmOps:
        """
        ...
    @staticmethod
-    def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None: 
+    def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0, bias: Tensor =  Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation =  Activation.None_) -> None: 
        """
        1. this function need to take a out features
        that from subm first mm.
@@ -32,6 +33,10 @@ class ConvGemmOps:
            subm: 
            algo: 
            stream_int: 
+            bias: 
+            act_alpha: 
+            act_beta: 
+            act_type: 
        """
        ...
    @staticmethod
@@ -56,7 +61,7 @@ class ConvGemmOps:
        """
        ...
    @staticmethod
-    def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer =  CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> Tuple[int, Any]: 
+    def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer =  CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False, bias: Tensor =  Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation =  Activation.None_) -> Tuple[int, Any]: 
        """
        Args:
            allocator: 
@@ -75,6 +80,10 @@ class ConvGemmOps:
            timer: 
            auto_fp32_accum: 
            fp32_accum: 
+            bias: 
+            act_alpha: 
+            act_beta: 
+            act_type: 
        """
        ...
    @staticmethod

--- a/spconv/core_cc/csrc/sparse/inference.pyi
+++ b/spconv/core_cc/csrc/sparse/inference.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+from cumm.tensorview.gemm import Activation
+class InferenceOps:
+    @staticmethod
+    def bias_add_act_inplace(out: Tensor, bias: Tensor, act_type: Activation =  Activation.None_, alpha: float = 0.0, beta: float = 0.0, stream: int = 0) -> None: 
+        """
+        Add ``bias`` to every row of ``out`` in place, then apply ``act_type``.
+
+        Raises a runtime error on CPU-only builds.
+
+        Args:
+            out: feature tensor rewritten in place; indexed as
+                ``(out.dim(0), out.dim(1))`` rows x channels. Supported dtypes
+                are float, double, half and bfloat16.
+            bias: per-channel bias; ``bias.dim(0)`` must equal ``out.dim(1)``.
+            act_type: activation selector; ``Activation.None_`` runs the
+                bias-only kernel.
+            alpha: multiplier applied to negative values by the LeakyReLU
+                branch.
+            beta: forwarded to the kernel, which currently does not read it.
+            stream: CUDA stream handle passed as an integer.
+        """
+        ...
+    @staticmethod
+    def bias_add_inplace(out: Tensor, bias: Tensor, stream: int = 0) -> None: 
+        """
+        Add ``bias`` to every row of ``out`` in place.
+
+        Forwards to ``bias_add_act_inplace`` with ``Activation.None_``.
+
+        Args:
+            out: feature tensor rewritten in place.
+            bias: per-channel bias; ``bias.dim(0)`` must equal ``out.dim(1)``.
+            stream: CUDA stream handle passed as an integer.
+        """
+        ...
+    @staticmethod
+    def activation_inplace(out: Tensor, act_type: Activation, alpha: float, beta: float, stream: int = 0) -> None: 
+        """
+        Apply ``act_type`` element-wise to ``out`` in place.
+
+        Raises a runtime error on CPU-only builds.
+
+        Args:
+            out: tensor rewritten in place; treated as a flat buffer of
+                ``out.size()`` elements. Supported dtypes are float, double,
+                half and bfloat16.
+            act_type: activation selector.
+            alpha: multiplier applied to negative values by the LeakyReLU
+                branch.
+            beta: forwarded to the kernel, which currently does not read it.
+            stream: CUDA stream handle passed as an integer.
+        """
+        ...
--- a/spconv/csrc/sparse/convops.py
+++ b/spconv/csrc/sparse/convops.py
@@ -14,7 +14,7 @@ from spconv.csrc.sparse.gather import GatherCPU

 from .alloc import ExternalAllocator
 from cumm.common import CompileInfo
-
+from .inference import InferenceOps

 class ExternalSpconvMatmul(pccm.Class):
    """a helper class to warp matmul operations
@@ -834,6 +834,12 @@ class GemmTunerSimple(pccm.ParameterizedClass):
        code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)",
                 "cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(False)")
        code.arg("force_nvrtc", f"bool", "false")
+        code.arg("bias", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
+        code.arg("act_alpha", f"float", "0.0")
+        code.arg("act_beta", f"float", "0.0")
+        code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
+
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
            return code
@@ -847,12 +853,13 @@ class GemmTunerSimple(pccm.ParameterizedClass):

        tv::gemm::GemmParams params;
        bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
-        if (desp.is_nvrtc && (desp_is_static || force_nvrtc)){{
+        if (force_nvrtc || (desp.is_nvrtc && desp_is_static)){{
            params.nvrtc_params = cached_get_nvrtc_params(desp, profile_res.arch, stream_int);
        }}
        params.a = a;
        params.b = b;
        params.c = c;
+        params.d = bias;
        params.a_inds = a_inds;
        params.b_inds = b_inds;
        params.c_inds = c_inds;
@@ -861,6 +868,10 @@ class GemmTunerSimple(pccm.ParameterizedClass):
        params.stream = stream_int;
        params.alpha = alpha;
        params.beta = beta;
+        params.act_alpha = act_alpha;
+        params.act_beta = act_beta;
+        params.act_type = act_type;
+
        params.workspace = workspace;
        GemmMain::matmul2(params);
        """)
@@ -1257,15 +1268,18 @@ class ConvTunerSimple(pccm.ParameterizedClass):
        code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)",
                 "cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(false)")
        code.arg("force_nvrtc", f"bool", "false")
+        code.arg("bias", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
+        code.arg("act_alpha", f"float", "0.0")
+        code.arg("act_beta", f"float", "0.0")
+        code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
+
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
            return code

        code.raw(f"""
        auto desp = profile_res.algo_desp;
-        if (force_nvrtc){{
-            desp.is_nvrtc = true;
-        }}
        int split_k_slices = 1;
        if (profile_res.splitk > 1){{
            split_k_slices = profile_res.splitk;
@@ -1276,7 +1290,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
        auto arch = profile_res.arch;
        tv::gemm::ConvParams params({NDIM_DONT_CARE}, op_type_cpp, timer);
        bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
-        if (desp.is_nvrtc && (desp_is_static || force_nvrtc)){{
+        if (force_nvrtc || (desp.is_nvrtc && desp_is_static)){{
            params.nvrtc_params = cached_get_nvrtc_params(desp, arch, stream_int);
        }}
        params.conv_algo_desp = desp;
@@ -1284,10 +1298,15 @@ class ConvTunerSimple(pccm.ParameterizedClass):
        params.weight = weight.view(channel_k, -1, channel_c);
        params.output = output;
        params.verbose = verbose;
+        params.bias = bias;

        params.split_k_slices = split_k_slices;
        params.alpha = alpha;
        params.beta = beta;
+        params.act_alpha = act_alpha;
+        params.act_beta = act_beta;
+        params.act_type = act_type;
+
        params.stream = stream_int;
        params.mask_argsort = mask_argsort;
        params.indices = indices;
@@ -1336,6 +1355,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
            GemmTuneResult,
            ConvTuneResult,
            ExternalSpconvMatmul,
+            InferenceOps,
        )
        self.add_param_class("gemm", gemm_tuner, "GemmTuner")
        self.add_param_class("conv", conv_tuner, "ConvTuner")
@@ -1384,11 +1404,18 @@ class ConvGemmOps(pccm.ParameterizedClass):
        code.arg("subm", "bool", "false")
        code.arg("algo", "int", f"{ConvAlgo.Native.value}")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
+        code.arg("bias", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
+        code.arg("act_alpha", f"float", "0.0")
+        code.arg("act_beta", f"float", "0.0")
+        code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")

        code.raw(f"""
        int kv_dim, out_channel, kv;
        std::vector<int64_t> filter_shape_per_kv;
        bool is_KC_not_CK;
+        bool has_bias = !bias.empty();
+        bool has_act = act_type != tv::gemm::Activation::kNone;
        if (!all_w_is_krsc){{
            kv_dim = 0;
            is_KC_not_CK = !filter_hwio;
@@ -1419,10 +1446,22 @@ class ConvGemmOps(pccm.ParameterizedClass):
            out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)}, 
                {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
        }}
+        if (has_act || has_bias){{
+            TV_ASSERT_RT_ERR(!features.is_cpu(), "bias and act don't support cpu.");
+        }}
        if (kv == 1 && subm){{
+            if (has_bias && has_act){{
+                InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
+            }}else{{
+                if (has_bias){{
+                    InferenceOps::bias_add_inplace(out_features, bias, stream_int);
+                }}
+                if (has_act){{
+                    InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
+                }}
+            }}
            return;
        }}
-        
        auto indice_pair_num_cpu = indice_pair_num.cpu();
        auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
        int maxnhot = 0;
@@ -1571,6 +1610,16 @@ class ConvGemmOps(pccm.ParameterizedClass):
                beta);
            inited = true;
        }}
+        if (has_bias && has_act){{
+            InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
+        }}else{{
+            if (has_bias){{
+                InferenceOps::bias_add_inplace(out_features, bias, stream_int);
+            }}
+            if (has_act){{
+                InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
+            }}
+        }}
        """)
        return code

@@ -1913,11 +1962,21 @@ class ConvGemmOps(pccm.ParameterizedClass):
        code.arg("auto_fp32_accum", "bool", "true")
        code.arg("fp32_accum", "bool", "false")

+        code.arg("bias", "tv::Tensor", "tv::Tensor()",
+                 "cumm.tensorview.Tensor = Tensor()")
+        code.arg("act_alpha", f"float", "0.0")
+        code.arg("act_beta", f"float", "0.0")
+        code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
+
+
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
            return code.ret("int")

        code.raw(f"""
+        if (!bias.empty() || act_type != tv::gemm::Activation::kNone){{
+            TV_ASSERT_RT_ERR(pair_mask_fwd_splits.size() == 1, "SplitGemm don't support fused bias/act for now.");
+        }}
        uint32_t* mask_ptr = masks.data_ptr<uint32_t>();
        int num_mask = masks.dim(0);
        int out_channel = filters.dim(0);
@@ -1989,6 +2048,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
        
        for (int j = 0; j < num_split; ++j){{
            float beta = j == 0 ? 0 : 1;
+
            conv_tuner.run_with_tuned_result(
                tune_res,
                kForwardInt,
@@ -2006,7 +2066,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
                stream_int,
                tv::Tensor(), // workspace
                false, // verbose
-                timer);
+                timer, 
+                false,
+                bias,
+                act_alpha,
+                act_beta,
+                act_type);
        }}
        // auto end_ev = tv::CUDAEvent();
        // end_ev.record(stream_int);

--- a/spconv/csrc/sparse/inference.py
+++ b/spconv/csrc/sparse/inference.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pccm
+from cumm.common import TensorView, GemmDTypes, TensorViewKernel, ThrustLib, GemmBasic
+from spconv.csrc.sparse.cpu_core import OMPLib
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+
+class InferenceOpsKernel(pccm.ParameterizedClass):
+    # Device-side kernels used by InferenceOps. Each method returns the
+    # pccm.FunctionCode of one CUDA kernel, templated on the element type T,
+    # that rewrites ``out_features`` in place.
+    #
+    # NOTE(review): row offsets are computed as ``i * num_features`` in 32-bit
+    # int; buffers with more than 2^31 elements would overflow -- confirm
+    # whether such sizes can reach these kernels.
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorViewKernel, GemmBasic)
+
+    @pccm.cuda.cuda_global_function
+    def bias_add_inplace_kernel(self):
+        # out_features[i, j] += bias[j] over a (size x num_features) buffer
+        # whose row i starts at ``out_features + i * num_features``.
+        # Rows are walked with tv::KernelLoopY and channels with
+        # tv::KernelLoopX (presumably the launch grid's y / x dimensions).
+        code = pccm.FunctionCode()
+        code.targ("T")
+
+        code.arg("out_features", f"T*")
+        code.arg("bias", f"const T*")
+        code.arg("size", "int")
+        code.arg("num_features", "int")
+
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(size)) {{
+            auto out_ptr = out_features + i * num_features;
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                out_ptr[j] = bias[j] + out_ptr[j];
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def bias_add_act_inplace_kernel(self):
+        # Fused variant: o = out[i, j] + bias[j], then the activation selected
+        # by ``act_type`` is applied to o before it is stored back.
+        #   kReLU      -> negatives become T(0)
+        #   kLeakyReLU -> negatives are multiplied by ``alpha``
+        # ``beta`` is accepted for interface symmetry but not read here.
+        code = pccm.FunctionCode()
+        code.targ("T")
+
+        code.arg("out_features", f"T*")
+        code.arg("bias", f"const T*")
+        code.arg("act_type", f"tv::gemm::Activation")
+        code.arg("alpha", f"T")
+        code.arg("beta", f"T")
+        code.arg("size", "int")
+        code.arg("num_features", "int")
+
+        # Every case ends with an explicit ``break``: a C++ switch otherwise
+        # falls through, so kReLU would also execute the kLeakyReLU branch.
+        code.raw(f"""
+        for (int i : tv::KernelLoopY<int>(size)) {{
+            auto out_ptr = out_features + i * num_features;
+            for (int j : tv::KernelLoopX<int>(num_features)) {{
+                T o = out_ptr[j] + bias[j];
+                switch (act_type){{
+                    case tv::gemm::Activation::kNone:
+                        break;
+                    case tv::gemm::Activation::kReLU:{{
+                        o = o >= T(0) ? o : T(0);
+                        break;
+                    }}
+                    case tv::gemm::Activation::kLeakyReLU:{{
+                        o = o >= T(0) ? o : o * alpha;
+                        break;
+                    }}
+                    default: ;
+                }}
+                out_ptr[j] = o;
+            }}
+        }}
+        """)
+        return code
+
+    @pccm.cuda.cuda_global_function
+    def activation_inplace_kernel(self):
+        # Element-wise activation over a flat buffer of ``size`` elements.
+        #   kReLU      -> negatives become T(0)
+        #   kLeakyReLU -> negatives are multiplied by ``alpha``
+        # ``beta`` is accepted for interface symmetry but not read here.
+        code = pccm.FunctionCode()
+        code.targ("T")
+
+        code.arg("out_features", f"T*")
+        code.arg("act_type", f"tv::gemm::Activation")
+        code.arg("alpha", f"T")
+        code.arg("beta", f"T")
+        code.arg("size", "int")
+
+        # The ``break`` after each case is required for correctness: without
+        # it the kReLU store is overwritten by the kLeakyReLU store, which
+        # re-reads the original value ``o`` and writes ``o * alpha`` for
+        # negative inputs (wrong whenever alpha != 0).
+        code.raw(f"""
+        for (int i : tv::KernelLoopX<int>(size)) {{
+            T o = out_features[i];
+            switch (act_type){{
+                case tv::gemm::Activation::kNone:
+                    break;
+                case tv::gemm::Activation::kReLU:{{
+                    out_features[i] = o >= T(0) ? o : T(0);
+                    break;
+                }}
+                case tv::gemm::Activation::kLeakyReLU:{{
+                    out_features[i] = o >= T(0) ? o : o * alpha;
+                    break;
+                }}
+                default: ;
+            }}
+        }}
+        """)
+        return code
+
+
+class InferenceOps(pccm.Class):
+    # Host-side static functions, exported to Python via pccm.pybind.mark,
+    # that launch the InferenceOpsKernel kernels to add a bias and/or apply an
+    # activation to a feature tensor in place.
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorView)
+        # Kernel class is attached per function below via add_param_class.
+        self.kernel = InferenceOpsKernel()
+        self.add_include("tensorview/gemm/core/constants.h")
+
+    # CPU-only builds generate plain static functions (whose bodies throw);
+    # otherwise the functions are generated as CUDA static functions.
+    if CUMM_CPU_ONLY_BUILD:
+        _DECORATOR = pccm.static_function
+    else:
+        _DECORATOR = pccm.cuda.static_function
+
+    # bias_add_act_inplace(out, bias, act_type, alpha, beta, stream):
+    # out[i, :] += bias, followed by the activation chosen by act_type.
+    # Requires bias.dim(0) == out.dim(1); dispatches on out.dtype() over
+    # float / double / half / bfloat16.
+    @pccm.pybind.mark
+    @_DECORATOR
+    def bias_add_act_inplace(self):
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("bias", "tv::Tensor")
+        code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
+        code.arg("alpha", f"float", "0.0")
+        code.arg("beta", f"float", "0.0")
+        code.arg("stream", "std::uintptr_t", "0")
+        if CUMM_CPU_ONLY_BUILD:
+            # No CPU implementation: the generated function only throws.
+            code.raw(f"""
+            TV_THROW_RT_ERR("this function don't support cpu only build.")
+            """)
+            return code
+        code.add_param_class("ker", self.kernel)
+        # Launch shape: a 2-D block of NumFeatures x (512 / NumFeatures)
+        # threads, where NumFeatures is picked from {512, ..., 16} by
+        # comparing against out.dim(1) (16 is the fallback when nothing
+        # matches); the grid covers channels in x and rows in y.
+        # act_type == kNone selects the bias-only kernel.
+        # NOTE(review): with out.dim(0) == 0 the grid's y extent is 0 --
+        # confirm tv::cuda::Launch tolerates an empty launch.
+        code.raw(f"""
+        auto nhot = out.dim(0);
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        TV_ASSERT_RT_ERR(bias.dim(0) == out.dim(1), "error");
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            constexpr int MaxThreads = 512;
+            tv::cuda::Launch launcher(1);
+            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
+                // if out.dim(1) > value in list above, run this function.
+                // if a value is found, other value won't be executed.
+                int NumFeatures = TV_DECLTYPE(V)::value;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }});
+            if (!found){{
+                int NumFeatures = 16;
+                int Num0 = MaxThreads / NumFeatures;
+                dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
+                dim3 threads(NumFeatures, Num0);
+                launcher = tv::cuda::Launch(blocks, threads, cudastream);
+            }}
+            if (act_type == tv::gemm::Activation::kNone){{
+                launcher(ker::bias_add_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
+                    nhot, out.dim(1));
+            }}else{{
+                launcher(ker::bias_add_act_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
+                    act_type, T(alpha), T(beta), nhot, out.dim(1));
+            }}
+
+        }});
+        """)
+        return code
+
+    # bias_add_inplace(out, bias, stream): bias add without activation.
+    # Thin wrapper over bias_add_act_inplace with kNone; it has no CPU-only
+    # guard of its own, so on CPU-only builds the callee's throw applies.
+    @pccm.pybind.mark
+    @_DECORATOR
+    def bias_add_inplace(self):
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("bias", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+        code.raw(f"""
+        return bias_add_act_inplace(out, bias, tv::gemm::Activation::kNone, 0, 0, stream);
+        """)
+        return code
+
+
+    # activation_inplace(out, act_type, alpha, beta, stream): element-wise
+    # activation over all out.size() elements, dispatched on out.dtype()
+    # over float / double / half / bfloat16. alpha and beta have no defaults.
+    @pccm.pybind.mark
+    @_DECORATOR
+    def activation_inplace(self):
+        code = pccm.FunctionCode()
+        code.arg("out", "tv::Tensor")
+        code.arg("act_type", f"tv::gemm::Activation")
+        code.arg("alpha", f"float")
+        code.arg("beta", f"float")
+        code.arg("stream", "std::uintptr_t", "0")
+        if CUMM_CPU_ONLY_BUILD:
+            # No CPU implementation: the generated function only throws.
+            code.raw(f"""
+            TV_THROW_RT_ERR("this function don't support cpu only build.")
+            """)
+            return code
+        code.add_param_class("ker", self.kernel)
+
+        # 1-D launch sized by the total element count. The kernel is launched
+        # even when act_type is kNone, in which case it stores nothing.
+        code.raw(f"""
+        auto nhot = out.size();
+        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
+        tv::cuda::Launch launcher = tv::cuda::Launch(nhot, cudastream);
+        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
+            using T = TV_DECLTYPE(I);
+            launcher(ker::activation_inplace_kernel<T>, out.data_ptr<T>(), act_type, T(alpha), T(beta),
+                nhot);
+        }});
+        """)
+        return code
--- a/spconv/gencode/__main__.py
+++ b/spconv/gencode/__main__.py
@@ -17,6 +17,7 @@ from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
 from spconv.csrc.utils import BoxOps
 from cumm.gemm.algospec.core import (GemmAlgo, ShuffleStrideType)
 from cumm.conv.bases import ConvLayout, ConvLayoutType, ConvOpType
+from spconv.csrc.sparse.inference import InferenceOps


 def main(include: str,
@@ -60,6 +61,7 @@ def main(include: str,
        ExternalSpconvMatmul(),
        SimpleExternalSpconvMatmul(),
        StaticAllocator(),
+        InferenceOps(),
    ]

    gen_cmake(libname, cus, include, src, namespace_prefix=prefix)

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -162,6 +162,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
            assert len(spatial_shape) == ndim, "spatial shape must equal to ndim"
            assert indices.dtype == torch.int32, "only support int32"
            assert batch_size > 0
+            # assert features.shape[0] == indices.shape[0]
        self._features = features
        self.indices = indices
        self.spatial_shape = [int(v) for v in spatial_shape]
@@ -197,6 +198,9 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):

        return new_spt

+    def minus(self):
+        """Return ``self.replace_feature`` applied to the negated features."""
+        negated_features = -self.features
+        return self.replace_feature(negated_features)
+
    @property
    def features(self):
        return self._features

--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -41,7 +41,7 @@ else:
    GEMM_CPP = None
    CONV_CPP = None
 import time
-from spconv.constants import FILTER_HWIO, ALL_WEIGHT_IS_KRSC, AllocKeys
+from spconv.constants import FILTER_HWIO, ALL_WEIGHT_IS_KRSC, AllocKeys, SPCONV_USE_DIRECT_TABLE
 from cumm.gemm import codeops
 from spconv.tools import CUDAKernelTimer

@@ -101,8 +101,12 @@ class _HashData:
                                        dtype=torch.int32,
                                        device=device)
            hashdata_tv = torch_tensor_to_tv(self.hashdata)
-            self.hashdata_k_tv = hashdata_tv[0]
-            self.hashdata_v_tv = hashdata_tv[1]
+            if num == 0:
+                self.hashdata_k_tv = tv.Tensor()
+                self.hashdata_v_tv = tv.Tensor()
+            else:
+                self.hashdata_k_tv = hashdata_tv[0]
+                self.hashdata_v_tv = hashdata_tv[1]


 def get_indice_pairs(indices: torch.Tensor,
@@ -315,7 +319,7 @@ def get_indice_pairs_implicit_gemm(
        alloc: Optional[ThrustSortAllocator] = None,
        timer: CUDAKernelTimer = CUDAKernelTimer(False),
        num_out_act_bound: int = -1,
-        direct_table: bool = True):
+        direct_table: bool = SPCONV_USE_DIRECT_TABLE):
    """
    Why return tuple? because pytorch seems don't support custom object in autograd.
    return: (
@@ -535,7 +539,6 @@ def get_indice_pairs_implicit_gemm(
            indices.shape[0], ksize, stride, padding, dilation)
        if transpose:
            max_num_act = kv * indices.shape[0]
-
        pair_bwd = pair
        pair_bwd_tv = pair_tv
        indice_pairs_uniq = torch.empty((pair.numel() + 1, ),

--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -32,9 +32,9 @@ def waymo_data(batch_size=1, num_features=-1):
    #                        150000)
    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    pc = np.ascontiguousarray(data["pc"])
-    print(pc.shape)
    voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
    voxels = voxels_tv.numpy().reshape(-1, 3)
+
    if num_features > 0:
        voxels = np.zeros((voxels.shape[0], num_features), dtype=voxels.dtype)
    coors = indices_tv.numpy()
@@ -316,6 +316,7 @@ import json

 def main():
    import pickle
+
    np.random.seed(50051)
    torch.manual_seed(50051)
    # voxels, coors, spatial_shape = waymo_data(num_features=128)
@@ -377,14 +378,6 @@ def main():
            # print("------------")
            with tv.measure_duration() as measure:
                out_nograd = net(voxels_th, coors_th, 1, show_metrics)
-            # res = timer.collect_by_name("forward", timer.get_all_pair_time())
-            # res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
-
-            # print(sum(res.values()) + sum(res2.values()))
-            # print(timer.get_all_pair_time())
-
-            # print(sum(timer.get_all_pair_time().values()))
-            # sort_bench()
            times.append(measure.duration)
            if show_metrics:
                timer = out_nograd._timer

--- a/test/test_all_algo.py
+++ b/test/test_all_algo.py
@@ -31,6 +31,7 @@ import pccm
 import torch
 import torch.nn.functional as F
 from spconv.core_cc.csrc.sparse.convops import GemmTuneResult, ConvTuneResult
+from spconv.pytorch.core import SparseConvTensor
 from spconv.test_utils import TestCase
 from cumm import tensorview as tv
 from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
@@ -44,8 +45,10 @@ from spconv.pytorch.cppcore import get_current_stream, torch_tensor_to_tv
 from spconv.test_utils import generate_sparse_data, params_grid
 import tqdm 
 from spconv.constants import ALL_WEIGHT_IS_KRSC, SPCONV_CPP_GEMM
-
+from spconv.core_cc.csrc.sparse.inference import InferenceOps
+from spconv.pytorch import functional as Fsp
 assert ALL_WEIGHT_IS_KRSC is True, "we only support KRSC in spconv >= 2.2"
+from spconv.pytorch.hash import HashTable

 # TODO remove or release this when tf32 op is ready
 torch.backends.cuda.matmul.allow_tf32 = False
@@ -60,8 +63,9 @@ NUMPY_DTYPE_TO_TORCH = {

 class SparseConvTester:
    def __init__(self, algo: ConvAlgo, subm: bool, shape: List[int], bs: int, dtype: np.dtype, N: int, K: int, C: int, 
-        ksize: int, stride: int, padding: int, dilation: int) -> None:
+        ksize: int, stride: int, padding: int, dilation: int, check_bias: bool = False, check_act: bool = False) -> None:
        ndim = 3
+        transpose = False
        self.shape = shape 
        self.bs = bs 
        self.dtype = dtype 
@@ -77,6 +81,15 @@ class SparseConvTester:
        op = expand_nd(ndim, 0)
        self.kv: int = np.prod(self.ksize)
        self.num_split = 1 if algo == ConvAlgo.MaskImplicitGemm else 2
+        if not subm:
+            if transpose:
+                out_shape = ops.get_deconv_output_size(shape, self.ksize, self.stride,
+                                                self.padding, self.dilation, op)
+            else:
+                out_shape = ops.get_conv_output_size(shape, self.ksize, self.stride,
+                                                self.padding, self.dilation)
+        else:
+            out_shape = shape

        sparse_dict = generate_sparse_data(shape, [N] * bs, C)

@@ -88,10 +101,15 @@ class SparseConvTester:
        out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
            indices_th, 1, shape, ConvAlgo.Native, self.ksize, self.stride, self.padding,
            self.dilation, op, subm)
+        self.ref_out_inds = out_inds
+        self.ref_out_inds_scalar = Fsp._indice_to_scalar(out_inds.long(), [bs, *out_shape])
        self.indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
        self.indice_pairs_np = pair_ref.cpu().numpy()
        self.pair_native = pair_ref
        self.indice_num_per_loc = indice_num_per_loc
+        self.use_direct_table = True
+        
+        self.out_shape = out_shape
        if algo == ConvAlgo.Native:
            self.out_inds: torch.Tensor = out_inds
            self.num_inds_per_loc: torch.Tensor = indice_num_per_loc
@@ -105,7 +123,7 @@ class SparseConvTester:
        else:
            res = ops.get_indice_pairs_implicit_gemm(indices_th, bs, shape,
                                                    algo, self.ksize, self.stride, self.padding,
-                                                    self.dilation, op, subm=subm)
+                                                    self.dilation, op, subm=subm, direct_table=self.use_direct_table)
            
            self.out_inds = res[0]
            self.num_inds_per_loc = res[1]
@@ -116,8 +134,27 @@ class SparseConvTester:
            self.mask_argsort_fwd_splits = res[6]
            self.mask_argsort_bwd_splits = res[7]
            self.masks = res[8]
+        
+        self.out_inds_scalar = Fsp._indice_to_scalar(self.out_inds.long(), [bs, *out_shape])
+
+        table = HashTable(out_inds.device, torch.int64, torch.int32, self.out_inds.shape[0] * 2)
+        # test coords -> test out indexes
+        table.insert(self.out_inds_scalar, torch.arange(0, self.out_inds.shape[0], dtype=torch.int32, device=self.device))
+        # out_order:  test_order_to_ref, test index for each ref coord
+        out_order, is_empty = table.query(self.ref_out_inds_scalar)
+        assert is_empty.int().sum().item() == 0, "shouldn't happen"
+        self.out_order = out_order.cpu().numpy()
+
+        # inp_table = HashTable(out_inds.device, torch.int64, torch.int32, self.ref_out_inds.shape[0] * 2)
+        # inp_table.insert(self.ref_out_inds_scalar, torch.arange(0, self.ref_out_inds.shape[0], dtype=torch.int32, device=self.device))
+        # # out_order:  ref index for each out coord
+        # out_order, is_empty = inp_table.query(self.out_inds_scalar)
+
+
        self.voxels_np = voxels_np
        self.indices_np = indices_np
+        self.check_bias = check_bias
+        self.check_act = check_act

        self.subm = subm
        if dtype == np.int8:
@@ -128,6 +165,10 @@ class SparseConvTester:
            self.output = np.random.randint(-2, 2, size=[
                self.out_inds.shape[0], K
            ]).astype(dtype)
+            self.bias = np.random.randint(-2, 2, size=[
+                K
+            ]).astype(dtype)
+
        else:
            self.inp = np.random.uniform(-1, 1, size=[
                voxels_np.shape[0], C
@@ -136,14 +177,25 @@ class SparseConvTester:
            self.output = np.random.uniform(-1, 1, size=[
                self.out_inds.shape[0], K
            ]).astype(dtype)
+            self.bias = np.random.uniform(-1, 1, size=[
+                K
+            ]).astype(dtype)
+
        self.weight_ref = self.weight.transpose(1, 2, 3, 0, 4)
        self.weight_ref = np.ascontiguousarray(self.weight_ref).reshape(-1, K, C)
-
        self.out_ref, self.din_ref, self.dw_ref = self._get_ref_output()
-
+        if check_bias:
+            self.out_ref += self.bias
+            # relu
+        if check_act:
+            self.out_ref = np.maximum(self.out_ref, 0)
        self.dw_ref = np.ascontiguousarray(self.dw_ref.transpose(1, 0, 2).reshape(K, *self.ksize, C))
        self.arch = tv.get_compute_capability()

+    def get_output_ref_spt(self):
+        """Wrap the reference output in a ``SparseConvTensor``.
+
+        ``self.out_ref`` (a numpy array) is converted to a torch tensor and
+        moved to CUDA, then paired with the reference output indices, the
+        output spatial shape and the batch size.
+        """
+        ref_features = torch.from_numpy(self.out_ref).cuda()
+        return SparseConvTensor(ref_features, self.ref_out_inds,
+                                self.out_shape, self.bs)
+
+
    def _get_ref_output(self):
        output_ref = np.zeros_like(self.output, dtype=np.float32)
        dinput_ref = np.zeros_like(self.inp, dtype=np.float32)
@@ -165,13 +217,15 @@ class SparseConvTester:
                np.float32) @ self.weight_ref[filter_offset].T.astype(
                    np.float32)
            output_ref[o_inds] += cc
-            a = self.output[o_inds]
+            # we use random output as dout here
+            a = self.output[self.out_order][o_inds]
            # NK @ KC
            cc = a.astype(
                np.float32) @ self.weight_ref[filter_offset].astype(
                    np.float32)
            dinput_ref[i_inds] += cc
-            out_gather = self.output[o_inds]  # [N, K]
+            # use random output and random inp as dout and inp
+            out_gather = self.output[self.out_order][o_inds]  # [N, K]
            inp_gather = self.inp[i_inds]  # [N, C]
            # KN @ NC
            dw_res = out_gather.astype(
@@ -225,7 +279,7 @@ def _test_impgemm_conv_cuda(subm: bool):
    shapes = [[19, 18, 17]]
    batchsizes = [1]
    dtypes = [np.float32, np.float16]
-    dtypes = [np.int8]
+    # dtypes = [np.int8]
    test_case = TestCase()
    # in_channels = [32]
    # out_channels = [32, 48, 64]
@@ -245,6 +299,7 @@ def _test_impgemm_conv_cuda(subm: bool):
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2]
+
    algos = [
        # ConvAlgo.MaskSplitImplicitGemm,
        ConvAlgo.MaskImplicitGemm,
@@ -261,11 +316,14 @@ def _test_impgemm_conv_cuda(subm: bool):
        multipler = max(C, K) / multiple_base
        multipler = max(multipler, 1.0)
        # print(num_batch)
-        tester = SparseConvTester(algo, subm, shape, bs, dtype, num_batch, K, C, k, s, p, d)
+        tester = SparseConvTester(algo, subm, shape, bs, dtype, num_batch, K, C, k, s, p, d, check_bias=True, check_act=True)
+        bias = None
+        act = tv.gemm.Activation.None_
+        if tester.check_bias:
+            bias = tv.from_numpy(tester.bias).cuda()
        atol, rtol = dtype_to_tol[dtype]
        mask_width_to_mask_out_fwd: Dict[int, torch.Tensor] = {}
        mask_width_to_mask_out_bwd: Dict[int, torch.Tensor] = {}
-
        op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput]
        spk = 1
        for op_type in op_types:
@@ -276,7 +334,11 @@ def _test_impgemm_conv_cuda(subm: bool):
                    NHWC.layout_type.value, NHWC.interleave, NHWC.interleave, NHWC.interleave, arch, op_type.value, -1, True, False)
            else:
                avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, op_type, -1)
-
+            if op_type == ConvOpType.kForward and tester.check_act:
+                act = tv.gemm.Activation.ReLU
+            else:
+                act = tv.gemm.Activation.None_
+            assert avail_desps
            for desp in avail_desps:
                if not subm:
                    if op_type == ConvOpType.kForward:
@@ -292,7 +354,10 @@ def _test_impgemm_conv_cuda(subm: bool):
                                      dtype=torch.int32,
                                      device=tester.device)
                mask_output_fwd = mask_width_to_mask_out_fwd[mask_width]
-
+                is_fwd = desp.op_type.value == ConvOpType.kForward.value
+                bias_cur = bias 
+                if op_type != ConvOpType.kForward:
+                    bias_cur = None
                if subm:
                    if desp.op_type.value == ConvOpType.kForward.value:
                        indice_pairs = tester.pair_fwd
@@ -303,9 +368,12 @@ def _test_impgemm_conv_cuda(subm: bool):
                    mask_output = mask_output_fwd
                    # print([bin(x.item()) for x in masks])
                    for j in range(tester.num_split):
-                        beta = 1 if j == 1 else 0
+                        beta = 1 if j > 0 else 0
+                        if bias_cur is not None:
+                            beta = 1
+                        if j > 0:
+                            bias_cur = None
                        mask_filter = tester.masks[j].item()
-
                        reverse_mask = False
                        if desp.op_type.value == ConvOpType.kBackwardWeight.value:
                            mask_op = mask_output[j]
@@ -338,6 +406,8 @@ def _test_impgemm_conv_cuda(subm: bool):
                                beta=beta,
                                verbose=False,
                                force_nvrtc=force_nvrtc,
+                                bias=bias_cur if is_fwd and bias_cur is not None else tv.Tensor(),
+                                act_type=act,
                            )
                        else:
                            CONV.run_with_tuned_result(
@@ -356,6 +426,8 @@ def _test_impgemm_conv_cuda(subm: bool):
                                beta=beta,
                                verbose=False,
                                force_nvrtc=force_nvrtc,
+                                bias=bias_cur if is_fwd else None,
+                                act_type=act,
                            )

                else:
@@ -382,7 +454,12 @@ def _test_impgemm_conv_cuda(subm: bool):
                        mask_output = mask_output_fwd

                    for j in range(tester.num_split):
-                        beta = 1 if j == 1 else 0
+                        # beta = 1 if j == 1 else 0
+                        beta = 1 if j > 0 else 0
+                        if bias_cur is not None:
+                            beta = 1
+                        if j > 0:
+                            bias_cur = None
                        mask_filter = tester.masks[j].item()
                        reverse_mask = False
                        if desp.op_type.value == ConvOpType.kBackwardWeight.value:
@@ -406,6 +483,9 @@ def _test_impgemm_conv_cuda(subm: bool):
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
+                                force_nvrtc=force_nvrtc,
+                                bias=bias if is_fwd and bias is not None else tv.Tensor(),
+                                act_type=act,
                            )
                        else:
                            CONV.run_with_tuned_result(
@@ -423,6 +503,9 @@ def _test_impgemm_conv_cuda(subm: bool):
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
+                                force_nvrtc=force_nvrtc,
+                                bias=bias if is_fwd else None,
+                                act_type=act,
                            )

                out_ref = tester.out_ref
@@ -430,6 +513,7 @@ def _test_impgemm_conv_cuda(subm: bool):
                dw_ref = tester.dw_ref
                if op_type == ConvOpType.kForward:
                    out_my = output_tv.cpu().numpy()
+                    out_my = out_my[tester.out_order]
                    if dtype != np.float16:
                        test_case.assertAllClose(out_ref, out_my, atol=atol, rtol=rtol)
                    else:
@@ -437,7 +521,6 @@ def _test_impgemm_conv_cuda(subm: bool):
                        if (error_norm > 5):
                            print(f"{desp}, Error={error_norm}")
                        assert error_norm < 10 * multipler
-                    # print(desp, )
                else:
                    din_my = inp_tv.cpu().numpy()
                    if dtype != np.float16:
@@ -446,7 +529,6 @@ def _test_impgemm_conv_cuda(subm: bool):
                        error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
                        assert error_norm < 10 * multipler, f"{desp}, {error_norm}, {k}, {s}, {p}, {d}"
        inp_tv, weight_tv, output_tv = tester.get_operands(ConvOpType.kBackwardWeight)
-
        for spk in [1, 4, 16, 64]:
            for mask_width, mask_output in mask_width_to_mask_out_fwd.items():
                if SPCONV_CPP_GEMM:
@@ -554,7 +636,10 @@ def _test_native_conv_cuda(subm: bool):
    for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid(
            shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations, dtypes)):
-        tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype, 1500, K, C, k, s, p, d)
+        tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype, 1500, K, C, k, s, p, d, check_bias=True, check_act=True)
+        bias = None
+        if tester.check_bias:
+            bias = tv.from_numpy(tester.bias).cuda()
        atol, rtol = dtype_to_tol[dtype]
        multipler = max(C, K) / multiple_base
        multipler = max(multipler, 1.0)
@@ -580,7 +665,6 @@ def _test_native_conv_cuda(subm: bool):
            inp_tv = torch_tensor_to_tv(inp_th)
            weight_tv = torch_tensor_to_tv(weight_th)
            output_tv = torch_tensor_to_tv(output_th)
-
            if op_type == ConvOpType.kForward:
                a = inp_tv
                c = output_tv
@@ -593,9 +677,11 @@ def _test_native_conv_cuda(subm: bool):
                for desp in avail_desps:
                    if subm:
                        torch.mm(inp_th, weight_th[:, tester.kv // 2].T, out=output_th)
+                            # output_th += bias_th
                    else:
                        output_tv.zero_()
                    inited = subm
+                    # determine last valid subm indices, then apply 
                    for i, nhot in enumerate(indice_pair_num_cpu):
                        if subm and i == kv_center:
                            continue
@@ -643,8 +729,14 @@ def _test_native_conv_cuda(subm: bool):
                                hint=AlgoHint.Fowrard.value,
                                alpha=1.0,
                                beta=beta)
-
                        inited = True
+                    if bias is not None and tester.check_act:
+                        InferenceOps.bias_add_act_inplace(output_tv, bias, tv.gemm.Activation.ReLU, 0, 0)
+                    else:
+                        if bias is not None:
+                            InferenceOps.bias_add_inplace(output_tv, bias, 0)
+                        if tester.check_act:
+                            InferenceOps.activation_inplace(output_tv, tv.gemm.Activation.ReLU, 0, 0)
                    out_my = output_tv.cpu().numpy()
                    if dtype != np.float16:
                        # error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
@@ -807,7 +899,7 @@ def _test_native_conv_cuda(subm: bool):
 def test_all_algo_unit():
+    """Run the implicit-GEMM and native conv checks with subm=True and subm=False."""
     # for i in range(5):
     _test_impgemm_conv_cuda(True)
     # Keep the non-subm run: calling the subm variant twice would leave the
     # regular (non-submanifold) implicit-GEMM path untested.
     _test_impgemm_conv_cuda(False)
     _test_native_conv_cuda(True)
     _test_native_conv_cuda(False)