Commit d0bfb3a3 authored by yan.yan's avatar yan.yan
Browse files

add fused bias/act

parent 2b195e43
......@@ -52,7 +52,7 @@ REQUIRES_PYTHON = '>=3.6'
VERSION = None
# What packages are required for this module to be executed?
REQUIRED = ["pccm>=0.2.21", "pybind11>=2.6.0", "fire", "numpy", *deps]
REQUIRED = ["pccm>=0.3.5", "pybind11>=2.6.0", "fire", "numpy", *deps]
# What packages are optional?
EXTRAS = {
......@@ -162,6 +162,7 @@ if disable_jit is not None and disable_jit == "1":
from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
from spconv.csrc.sparse.inference import InferenceOps
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
......@@ -192,7 +193,7 @@ if disable_jit is not None and disable_jit == "1":
cus = [gemmtuner, convtuner,
convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(),
ExternalAllocator(),
ExternalSpconvMatmul()]
ExternalSpconvMatmul(), InferenceOps()]
if not CUMM_CPU_ONLY_BUILD:
cus.extend([cu, convcu])
ext_modules: List[Extension] = [
......
......@@ -606,7 +606,11 @@ class SimpleGemm:
gather_data: tv.Tensor = tv.Tensor(),
workspace: tv.Tensor = tv.Tensor(),
timer: CUDAKernelTimer = CUDAKernelTimer(False),
force_nvrtc: bool = False):
force_nvrtc: bool = False,
bias: Optional[tv.Tensor] = None,
act_alpha: float = 0.0,
act_beta: float = 0.0,
act_type: tv.gemm.Activation = tv.gemm.Activation.None_):
m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape, trans_a,
trans_b, trans_c,
shuffle_type.value,
......@@ -630,6 +634,8 @@ class SimpleGemm:
params.a = a
params.b = b
params.c = c
if bias is not None:
params.d = bias
params.a_inds = a_inds
params.b_inds = b_inds
params.c_inds = c_inds
......@@ -638,6 +644,9 @@ class SimpleGemm:
params.stream = stream
params.alpha = alpha
params.beta = beta
params.act_alpha = act_alpha
params.act_beta = act_beta
params.act_type = act_type
params.workspace = workspace
# gather = 0
# if profile_res.external_gather and not gather_data.empty():
......@@ -973,7 +982,11 @@ class SimpleConv:
workspace: tv.Tensor = tv.Tensor(),
verbose: bool = False,
timer: CUDAKernelTimer = CUDAKernelTimer(False),
force_nvrtc: bool = False):
force_nvrtc: bool = False,
bias: Optional[tv.Tensor] = None,
act_alpha: float = 0.0,
act_beta: float = 0.0,
act_type: tv.gemm.Activation = tv.gemm.Activation.None_):
channel_k = output.dim(1)
channel_c = inp.dim(1)
# GemmMainUnitTest.stream_synchronize(stream)
......@@ -989,7 +1002,7 @@ class SimpleConv:
params = ConvParams(NDIM_DONT_CARE, ConvOpTypeCpp(op_type_value))
is_not_static = str(
algo_desp) not in self.prebuilt_desp_names
if algo_desp.is_nvrtc and (is_not_static or force_nvrtc):
if force_nvrtc or (algo_desp.is_nvrtc and is_not_static):
params.nvrtc_params = self._cached_get_nvrtc_params(
algo_desp, profile_res.arch)
params.conv_algo_desp = profile_res.algo_desp
......@@ -1001,6 +1014,9 @@ class SimpleConv:
params.split_k_slices = split_k_slices
params.alpha = alpha
params.beta = beta
params.act_alpha = act_alpha
params.act_beta = act_beta
params.act_type = act_type
params.stream = stream
params.mask_argsort = mask_argsort
params.indices = indices
......@@ -1011,6 +1027,8 @@ class SimpleConv:
params.mask_filter = mask_filter
params.mask_output = mask_output
params.reverse_mask = reverse_mask
if bias is not None:
params.bias = bias
if timer.enable:
assert timer._timer is not None
params.timer = timer._timer
......
......@@ -36,6 +36,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul
from spconv.csrc.sparse.inference import InferenceOps
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
......@@ -63,6 +64,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
ExternalAllocator(),
ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(), # for debug, won't be included in release
InferenceOps(),
]
pccm.builder.build_pybind(cus,
PACKAGE_ROOT / "core_cc",
......
......@@ -100,7 +100,9 @@ class AllocKeys:
SPCONV_DEBUG_WEIGHT = False
SPCONV_CPP_INDICE_PAIRS = False
SPCONV_CPP_INDICE_PAIRS = True
SPCONV_USE_DIRECT_TABLE = True
# Currently, generating indice pairs in C++ is slightly slower than in Python; the cause is unknown.
SPCONV_CPP_INDICE_PAIRS_IGEMM = os.getenv("SPCONV_CPP_INDICE_PAIRS_IGEMM", "0") == "1"
......
......@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview.gemm import Activation
class ConvTunerSimple:
def __init__(self, desps: List[ConvAlgoDesp]) -> None:
"""
......@@ -88,7 +89,7 @@ class ConvTunerSimple:
mask_width:
"""
...
def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False) -> None:
def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> None:
"""
Args:
profile_res:
......@@ -110,6 +111,10 @@ class ConvTunerSimple:
verbose:
timer:
force_nvrtc:
bias:
act_alpha:
act_beta:
act_type:
"""
...
def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int:
......
......@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview.gemm import Activation
class GemmTunerSimple:
def __init__(self, desps: List[GemmAlgoDesp]) -> None:
"""
......@@ -81,7 +82,7 @@ class GemmTunerSimple:
hint:
"""
...
def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False) -> None:
def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> None:
"""
Args:
profile_res:
......@@ -103,5 +104,9 @@ class GemmTunerSimple:
workspace:
timer:
force_nvrtc:
bias:
act_alpha:
act_beta:
act_type:
"""
...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import Activation
from cumm.tensorview import CUDAKernelTimer
class ConvGemmOps:
@staticmethod
......@@ -11,7 +12,7 @@ class ConvGemmOps:
"""
...
@staticmethod
def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None:
def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> None:
"""
1. this function need to take a out features
that from subm first mm.
......@@ -32,6 +33,10 @@ class ConvGemmOps:
subm:
algo:
stream_int:
bias:
act_alpha:
act_beta:
act_type:
"""
...
@staticmethod
......@@ -56,7 +61,7 @@ class ConvGemmOps:
"""
...
@staticmethod
def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> Tuple[int, Any]:
def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> Tuple[int, Any]:
"""
Args:
allocator:
......@@ -75,6 +80,10 @@ class ConvGemmOps:
timer:
auto_fp32_accum:
fp32_accum:
bias:
act_alpha:
act_beta:
act_type:
"""
...
@staticmethod
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import Activation
class InferenceOps:
    """Type stub for the compiled ``InferenceOps`` static functions."""

    @staticmethod
    def bias_add_act_inplace(
        out: Tensor,
        bias: Tensor,
        act_type: Activation = Activation.None_,
        alpha: float = 0.0,
        beta: float = 0.0,
        stream: int = 0,
    ) -> None:
        """Add ``bias`` to ``out`` and apply an activation, in place.

        Args:
            out: feature tensor that is modified in place.
            bias: bias tensor; its first dimension must match ``out``'s
                second dimension.
            act_type: activation applied after the bias add.
            alpha: first activation parameter.
            beta: second activation parameter.
            stream: CUDA stream handle passed as an integer.
        """
        ...

    @staticmethod
    def bias_add_inplace(
        out: Tensor,
        bias: Tensor,
        stream: int = 0,
    ) -> None:
        """Add ``bias`` to ``out`` in place, without any activation.

        Args:
            out: feature tensor that is modified in place.
            bias: bias tensor added to ``out``.
            stream: CUDA stream handle passed as an integer.
        """
        ...

    @staticmethod
    def activation_inplace(
        out: Tensor,
        act_type: Activation,
        alpha: float,
        beta: float,
        stream: int = 0,
    ) -> None:
        """Apply an activation to ``out`` in place.

        Args:
            out: tensor that is modified in place.
            act_type: activation to apply.
            alpha: first activation parameter.
            beta: second activation parameter.
            stream: CUDA stream handle passed as an integer.
        """
        ...
......@@ -14,7 +14,7 @@ from spconv.csrc.sparse.gather import GatherCPU
from .alloc import ExternalAllocator
from cumm.common import CompileInfo
from .inference import InferenceOps
class ExternalSpconvMatmul(pccm.Class):
"""a helper class to warp matmul operations
......@@ -834,6 +834,12 @@ class GemmTunerSimple(pccm.ParameterizedClass):
code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)",
"cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(False)")
code.arg("force_nvrtc", f"bool", "false")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
if CUMM_CPU_ONLY_BUILD:
code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
return code
......@@ -847,12 +853,13 @@ class GemmTunerSimple(pccm.ParameterizedClass):
tv::gemm::GemmParams params;
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
if (desp.is_nvrtc && (desp_is_static || force_nvrtc)){{
if (force_nvrtc || (desp.is_nvrtc && desp_is_static)){{
params.nvrtc_params = cached_get_nvrtc_params(desp, profile_res.arch, stream_int);
}}
params.a = a;
params.b = b;
params.c = c;
params.d = bias;
params.a_inds = a_inds;
params.b_inds = b_inds;
params.c_inds = c_inds;
......@@ -861,6 +868,10 @@ class GemmTunerSimple(pccm.ParameterizedClass):
params.stream = stream_int;
params.alpha = alpha;
params.beta = beta;
params.act_alpha = act_alpha;
params.act_beta = act_beta;
params.act_type = act_type;
params.workspace = workspace;
GemmMain::matmul2(params);
""")
......@@ -1257,15 +1268,18 @@ class ConvTunerSimple(pccm.ParameterizedClass):
# The last argument is the annotation text emitted into the generated Python
# stub, so its default has to be valid Python: `False`, not the C++ literal
# `false` (the third argument stays C++).
code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)",
         "cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(False)")
code.arg("force_nvrtc", f"bool", "false")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
if CUMM_CPU_ONLY_BUILD:
code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
return code
code.raw(f"""
auto desp = profile_res.algo_desp;
if (force_nvrtc){{
desp.is_nvrtc = true;
}}
int split_k_slices = 1;
if (profile_res.splitk > 1){{
split_k_slices = profile_res.splitk;
......@@ -1276,7 +1290,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
auto arch = profile_res.arch;
tv::gemm::ConvParams params({NDIM_DONT_CARE}, op_type_cpp, timer);
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
if (desp.is_nvrtc && (desp_is_static || force_nvrtc)){{
if (force_nvrtc || (desp.is_nvrtc && desp_is_static)){{
params.nvrtc_params = cached_get_nvrtc_params(desp, arch, stream_int);
}}
params.conv_algo_desp = desp;
......@@ -1284,10 +1298,15 @@ class ConvTunerSimple(pccm.ParameterizedClass):
params.weight = weight.view(channel_k, -1, channel_c);
params.output = output;
params.verbose = verbose;
params.bias = bias;
params.split_k_slices = split_k_slices;
params.alpha = alpha;
params.beta = beta;
params.act_alpha = act_alpha;
params.act_beta = act_beta;
params.act_type = act_type;
params.stream = stream_int;
params.mask_argsort = mask_argsort;
params.indices = indices;
......@@ -1336,6 +1355,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
GemmTuneResult,
ConvTuneResult,
ExternalSpconvMatmul,
InferenceOps,
)
self.add_param_class("gemm", gemm_tuner, "GemmTuner")
self.add_param_class("conv", conv_tuner, "ConvTuner")
......@@ -1384,11 +1404,18 @@ class ConvGemmOps(pccm.ParameterizedClass):
code.arg("subm", "bool", "false")
code.arg("algo", "int", f"{ConvAlgo.Native.value}")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
code.raw(f"""
int kv_dim, out_channel, kv;
std::vector<int64_t> filter_shape_per_kv;
bool is_KC_not_CK;
bool has_bias = !bias.empty();
bool has_act = act_type != tv::gemm::Activation::kNone;
if (!all_w_is_krsc){{
kv_dim = 0;
is_KC_not_CK = !filter_hwio;
......@@ -1419,10 +1446,22 @@ class ConvGemmOps(pccm.ParameterizedClass):
out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}}
if (has_act || has_bias){{
TV_ASSERT_RT_ERR(!features.is_cpu(), "bias and act don't support cpu.");
}}
if (kv == 1 && subm){{
if (has_bias && has_act){{
InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
}}else{{
if (has_bias){{
InferenceOps::bias_add_inplace(out_features, bias, stream_int);
}}
if (has_act){{
InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
}}
}}
return;
}}
auto indice_pair_num_cpu = indice_pair_num.cpu();
auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
int maxnhot = 0;
......@@ -1571,6 +1610,16 @@ class ConvGemmOps(pccm.ParameterizedClass):
beta);
inited = true;
}}
if (has_bias && has_act){{
InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
}}else{{
if (has_bias){{
InferenceOps::bias_add_inplace(out_features, bias, stream_int);
}}
if (has_act){{
InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
}}
}}
""")
return code
......@@ -1913,11 +1962,21 @@ class ConvGemmOps(pccm.ParameterizedClass):
code.arg("auto_fp32_accum", "bool", "true")
code.arg("fp32_accum", "bool", "false")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
if CUMM_CPU_ONLY_BUILD:
code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
return code.ret("int")
code.raw(f"""
if (!bias.empty() || act_type != tv::gemm::Activation::kNone){{
TV_ASSERT_RT_ERR(pair_mask_fwd_splits.size() == 1, "SplitGemm don't support fused bias/act for now.");
}}
uint32_t* mask_ptr = masks.data_ptr<uint32_t>();
int num_mask = masks.dim(0);
int out_channel = filters.dim(0);
......@@ -1989,6 +2048,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
for (int j = 0; j < num_split; ++j){{
float beta = j == 0 ? 0 : 1;
conv_tuner.run_with_tuned_result(
tune_res,
kForwardInt,
......@@ -2006,7 +2066,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
stream_int,
tv::Tensor(), // workspace
false, // verbose
timer);
timer,
false,
bias,
act_alpha,
act_beta,
act_type);
}}
// auto end_ev = tv::CUDAEvent();
// end_ev.record(stream_int);
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pccm
from cumm.common import TensorView, GemmDTypes, TensorViewKernel, ThrustLib, GemmBasic
from spconv.csrc.sparse.cpu_core import OMPLib
from cumm.constants import CUMM_CPU_ONLY_BUILD
class InferenceOpsKernel(pccm.ParameterizedClass):
    """CUDA kernel definitions for in-place bias-add and activation.

    Every method returns a ``pccm.FunctionCode`` for a kernel templated on
    the element type ``T``. The two bias kernels address the feature buffer
    row by row as ``out_features + i * num_features``.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorViewKernel, GemmBasic)

    @pccm.cuda.cuda_global_function
    def bias_add_inplace_kernel(self):
        """Kernel computing ``out_features[i, j] += bias[j]`` in place.

        Kernel args: ``out_features`` (``size`` rows of ``num_features``
        elements), ``bias`` (``num_features`` elements), ``size`` (row
        count) and ``num_features`` (row length).
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", f"T*")
        code.arg("bias", f"const T*")
        code.arg("size", "int")
        code.arg("num_features", "int")
        code.raw(f"""
        for (int i : tv::KernelLoopY<int>(size)) {{
            auto out_ptr = out_features + i * num_features;
            for (int j : tv::KernelLoopX<int>(num_features)) {{
                out_ptr[j] = bias[j] + out_ptr[j];
            }}
        }}
        """)
        return code

    @pccm.cuda.cuda_global_function
    def bias_add_act_inplace_kernel(self):
        """Kernel computing ``out = act(out + bias)`` in place.

        ``alpha`` is the negative-side slope used by ``kLeakyReLU``; ``beta``
        is accepted but not read by this kernel. Activation values other
        than ``kNone``/``kReLU``/``kLeakyReLU`` leave ``out + bias``
        unchanged.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", f"T*")
        code.arg("bias", f"const T*")
        code.arg("act_type", f"tv::gemm::Activation")
        code.arg("alpha", f"T")
        code.arg("beta", f"T")
        code.arg("size", "int")
        code.arg("num_features", "int")
        # Every case ends with `break` so that one activation can never fall
        # through into the next one.
        code.raw(f"""
        for (int i : tv::KernelLoopY<int>(size)) {{
            auto out_ptr = out_features + i * num_features;
            for (int j : tv::KernelLoopX<int>(num_features)) {{
                T o = out_ptr[j] + bias[j];
                switch (act_type){{
                    case tv::gemm::Activation::kNone:
                        break;
                    case tv::gemm::Activation::kReLU:{{
                        o = o >= T(0) ? o : T(0);
                        break;
                    }}
                    case tv::gemm::Activation::kLeakyReLU:{{
                        o = o >= T(0) ? o : o * alpha;
                        break;
                    }}
                    default: ;
                }}
                out_ptr[j] = o;
            }}
        }}
        """)
        return code

    @pccm.cuda.cuda_global_function
    def activation_inplace_kernel(self):
        """Kernel applying an activation to ``size`` contiguous elements.

        ``alpha`` is the negative-side slope used by ``kLeakyReLU``; ``beta``
        is accepted but not read by this kernel. Activation values other
        than ``kReLU``/``kLeakyReLU`` leave the buffer untouched.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", f"T*")
        code.arg("act_type", f"tv::gemm::Activation")
        code.arg("alpha", f"T")
        code.arg("beta", f"T")
        code.arg("size", "int")
        # The `break` after kReLU is required: without it control falls into
        # the kLeakyReLU case, which recomputes from the original value `o`
        # and stores `o * alpha` for negative inputs instead of zero.
        code.raw(f"""
        for (int i : tv::KernelLoopX<int>(size)) {{
            T o = out_features[i];
            switch (act_type){{
                case tv::gemm::Activation::kNone:
                    break;
                case tv::gemm::Activation::kReLU:{{
                    out_features[i] = o >= T(0) ? o : T(0);
                    break;
                }}
                case tv::gemm::Activation::kLeakyReLU:{{
                    out_features[i] = o >= T(0) ? o : o * alpha;
                    break;
                }}
                default: ;
            }}
        }}
        """)
        return code
class InferenceOps(pccm.Class):
    """Host-side launchers for in-place bias-add / activation on features.

    The three static functions are marked for pybind export. In a CPU-only
    build the CUDA launchers are replaced by bodies that throw.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        self.kernel = InferenceOpsKernel()
        self.add_include("tensorview/gemm/core/constants.h")

    # CPU-only builds use the plain static-function decorator instead of the
    # CUDA one; the generated bodies then only throw.
    if CUMM_CPU_ONLY_BUILD:
        _DECORATOR = pccm.static_function
    else:
        _DECORATOR = pccm.cuda.static_function

    @pccm.pybind.mark
    @_DECORATOR
    def bias_add_act_inplace(self):
        """Generate ``out = act(out + bias)`` computed in place.

        ``out`` is indexed as ``[out.dim(0), out.dim(1)]`` and ``bias`` must
        have ``out.dim(1)`` entries and the same dtype as ``out``. When
        ``act_type`` is ``kNone`` the plain bias-add kernel is launched. A
        tensor with a zero dimension is a no-op.
        """
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("bias", "tv::Tensor")
        code.arg("act_type", f"tv::gemm::Activation",
                 "tv::gemm::Activation::kNone",
                 "cumm.tensorview.gemm.Activation = Activation.None_")
        code.arg("alpha", f"float", "0.0")
        code.arg("beta", f"float", "0.0")
        code.arg("stream", "std::uintptr_t", "0")
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
            TV_THROW_RT_ERR("this function don't support cpu only build.")
            """)
            return code
        code.add_param_class("ker", self.kernel)
        code.raw(f"""
        auto nhot = out.dim(0);
        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
        TV_ASSERT_RT_ERR(bias.dim(0) == out.dim(1), "bias.dim(0) must be equal to out.dim(1) (number of channels).");
        TV_ASSERT_RT_ERR(bias.dtype() == out.dtype(), "bias and out must have the same dtype.");
        if (nhot == 0 || out.dim(1) == 0){{
            // nothing to process; a grid with a zero-sized dimension is an
            // invalid launch configuration.
            return;
        }}
        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            constexpr int MaxThreads = 512;
            tv::cuda::Launch launcher(1);
            bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
                // runs for the first value in the list above that satisfies
                // out.dim(1) >= value; remaining values are not tried.
                int NumFeatures = TV_DECLTYPE(V)::value;
                int Num0 = MaxThreads / NumFeatures;
                dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
                dim3 threads(NumFeatures, Num0);
                launcher = tv::cuda::Launch(blocks, threads, cudastream);
            }});
            if (!found){{
                // fewer than 16 channels: fall back to the smallest tile.
                int NumFeatures = 16;
                int Num0 = MaxThreads / NumFeatures;
                dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
                dim3 threads(NumFeatures, Num0);
                launcher = tv::cuda::Launch(blocks, threads, cudastream);
            }}
            if (act_type == tv::gemm::Activation::kNone){{
                launcher(ker::bias_add_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
                    nhot, out.dim(1));
            }}else{{
                launcher(ker::bias_add_act_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
                    act_type, T(alpha), T(beta), nhot, out.dim(1));
            }}
        }});
        """)
        return code

    @pccm.pybind.mark
    @_DECORATOR
    def bias_add_inplace(self):
        """Generate ``out += bias``; forwards to ``bias_add_act_inplace``
        with ``kNone`` as the activation."""
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("bias", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        return bias_add_act_inplace(out, bias, tv::gemm::Activation::kNone, 0, 0, stream);
        """)
        return code

    @pccm.pybind.mark
    @_DECORATOR
    def activation_inplace(self):
        """Generate an in-place activation over all ``out.size()`` elements.

        An ``out`` with zero elements is a no-op.
        """
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("act_type", f"tv::gemm::Activation")
        code.arg("alpha", f"float")
        code.arg("beta", f"float")
        code.arg("stream", "std::uintptr_t", "0")
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
            TV_THROW_RT_ERR("this function don't support cpu only build.")
            """)
            return code
        code.add_param_class("ker", self.kernel)
        code.raw(f"""
        auto nhot = out.size();
        if (nhot == 0){{
            // nothing to process; avoid launching a zero-sized grid.
            return;
        }}
        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
        tv::cuda::Launch launcher = tv::cuda::Launch(nhot, cudastream);
        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            launcher(ker::activation_inplace_kernel<T>, out.data_ptr<T>(), act_type, T(alpha), T(beta),
                nhot);
        }});
        """)
        return code
......@@ -17,6 +17,7 @@ from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
from spconv.csrc.utils import BoxOps
from cumm.gemm.algospec.core import (GemmAlgo, ShuffleStrideType)
from cumm.conv.bases import ConvLayout, ConvLayoutType, ConvOpType
from spconv.csrc.sparse.inference import InferenceOps
def main(include: str,
......@@ -60,6 +61,7 @@ def main(include: str,
ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(),
StaticAllocator(),
InferenceOps(),
]
gen_cmake(libname, cus, include, src, namespace_prefix=prefix)
......
......@@ -162,6 +162,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
assert len(spatial_shape) == ndim, "spatial shape must equal to ndim"
assert indices.dtype == torch.int32, "only support int32"
assert batch_size > 0
# assert features.shape[0] == indices.shape[0]
self._features = features
self.indices = indices
self.spatial_shape = [int(v) for v in spatial_shape]
......@@ -197,6 +198,9 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
return new_spt
def minus(self):
return self.replace_feature(-self.features)
@property
def features(self):
return self._features
......
......@@ -41,7 +41,7 @@ else:
GEMM_CPP = None
CONV_CPP = None
import time
from spconv.constants import FILTER_HWIO, ALL_WEIGHT_IS_KRSC, AllocKeys
from spconv.constants import FILTER_HWIO, ALL_WEIGHT_IS_KRSC, AllocKeys, SPCONV_USE_DIRECT_TABLE
from cumm.gemm import codeops
from spconv.tools import CUDAKernelTimer
......@@ -101,8 +101,12 @@ class _HashData:
dtype=torch.int32,
device=device)
hashdata_tv = torch_tensor_to_tv(self.hashdata)
self.hashdata_k_tv = hashdata_tv[0]
self.hashdata_v_tv = hashdata_tv[1]
if num == 0:
self.hashdata_k_tv = tv.Tensor()
self.hashdata_v_tv = tv.Tensor()
else:
self.hashdata_k_tv = hashdata_tv[0]
self.hashdata_v_tv = hashdata_tv[1]
def get_indice_pairs(indices: torch.Tensor,
......@@ -315,7 +319,7 @@ def get_indice_pairs_implicit_gemm(
alloc: Optional[ThrustSortAllocator] = None,
timer: CUDAKernelTimer = CUDAKernelTimer(False),
num_out_act_bound: int = -1,
direct_table: bool = True):
direct_table: bool = SPCONV_USE_DIRECT_TABLE):
"""
Why return a tuple? Because PyTorch does not seem to support custom objects in autograd.
return: (
......@@ -535,7 +539,6 @@ def get_indice_pairs_implicit_gemm(
indices.shape[0], ksize, stride, padding, dilation)
if transpose:
max_num_act = kv * indices.shape[0]
pair_bwd = pair
pair_bwd_tv = pair_tv
indice_pairs_uniq = torch.empty((pair.numel() + 1, ),
......
......@@ -32,9 +32,9 @@ def waymo_data(batch_size=1, num_features=-1):
# 150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = np.ascontiguousarray(data["pc"])
print(pc.shape)
voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
voxels = voxels_tv.numpy().reshape(-1, 3)
if num_features > 0:
voxels = np.zeros((voxels.shape[0], num_features), dtype=voxels.dtype)
coors = indices_tv.numpy()
......@@ -316,6 +316,7 @@ import json
def main():
import pickle
np.random.seed(50051)
torch.manual_seed(50051)
# voxels, coors, spatial_shape = waymo_data(num_features=128)
......@@ -377,14 +378,6 @@ def main():
# print("------------")
with tv.measure_duration() as measure:
out_nograd = net(voxels_th, coors_th, 1, show_metrics)
# res = timer.collect_by_name("forward", timer.get_all_pair_time())
# res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
# print(sum(res.values()) + sum(res2.values()))
# print(timer.get_all_pair_time())
# print(sum(timer.get_all_pair_time().values()))
# sort_bench()
times.append(measure.duration)
if show_metrics:
timer = out_nograd._timer
......
......@@ -31,6 +31,7 @@ import pccm
import torch
import torch.nn.functional as F
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult, ConvTuneResult
from spconv.pytorch.core import SparseConvTensor
from spconv.test_utils import TestCase
from cumm import tensorview as tv
from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
......@@ -44,8 +45,10 @@ from spconv.pytorch.cppcore import get_current_stream, torch_tensor_to_tv
from spconv.test_utils import generate_sparse_data, params_grid
import tqdm
from spconv.constants import ALL_WEIGHT_IS_KRSC, SPCONV_CPP_GEMM
from spconv.core_cc.csrc.sparse.inference import InferenceOps
from spconv.pytorch import functional as Fsp
assert ALL_WEIGHT_IS_KRSC is True, "we only support KRSC in spconv >= 2.2"
from spconv.pytorch.hash import HashTable
# TODO remove or release this when tf32 op is ready
torch.backends.cuda.matmul.allow_tf32 = False
......@@ -60,8 +63,9 @@ NUMPY_DTYPE_TO_TORCH = {
class SparseConvTester:
def __init__(self, algo: ConvAlgo, subm: bool, shape: List[int], bs: int, dtype: np.dtype, N: int, K: int, C: int,
ksize: int, stride: int, padding: int, dilation: int) -> None:
ksize: int, stride: int, padding: int, dilation: int, check_bias: bool = False, check_act: bool = False) -> None:
ndim = 3
transpose = False
self.shape = shape
self.bs = bs
self.dtype = dtype
......@@ -77,6 +81,15 @@ class SparseConvTester:
op = expand_nd(ndim, 0)
self.kv: int = np.prod(self.ksize)
self.num_split = 1 if algo == ConvAlgo.MaskImplicitGemm else 2
if not subm:
if transpose:
out_shape = ops.get_deconv_output_size(shape, self.ksize, self.stride,
self.padding, self.dilation, op)
else:
out_shape = ops.get_conv_output_size(shape, self.ksize, self.stride,
self.padding, self.dilation)
else:
out_shape = shape
sparse_dict = generate_sparse_data(shape, [N] * bs, C)
......@@ -88,10 +101,15 @@ class SparseConvTester:
out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
indices_th, 1, shape, ConvAlgo.Native, self.ksize, self.stride, self.padding,
self.dilation, op, subm)
self.ref_out_inds = out_inds
self.ref_out_inds_scalar = Fsp._indice_to_scalar(out_inds.long(), [bs, *out_shape])
self.indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
self.indice_pairs_np = pair_ref.cpu().numpy()
self.pair_native = pair_ref
self.indice_num_per_loc = indice_num_per_loc
self.use_direct_table = True
self.out_shape = out_shape
if algo == ConvAlgo.Native:
self.out_inds: torch.Tensor = out_inds
self.num_inds_per_loc: torch.Tensor = indice_num_per_loc
......@@ -105,7 +123,7 @@ class SparseConvTester:
else:
res = ops.get_indice_pairs_implicit_gemm(indices_th, bs, shape,
algo, self.ksize, self.stride, self.padding,
self.dilation, op, subm=subm)
self.dilation, op, subm=subm, direct_table=self.use_direct_table)
self.out_inds = res[0]
self.num_inds_per_loc = res[1]
......@@ -116,8 +134,27 @@ class SparseConvTester:
self.mask_argsort_fwd_splits = res[6]
self.mask_argsort_bwd_splits = res[7]
self.masks = res[8]
self.out_inds_scalar = Fsp._indice_to_scalar(self.out_inds.long(), [bs, *out_shape])
table = HashTable(out_inds.device, torch.int64, torch.int32, self.out_inds.shape[0] * 2)
# test coords -> test out indexes
table.insert(self.out_inds_scalar, torch.arange(0, self.out_inds.shape[0], dtype=torch.int32, device=self.device))
# out_order: test_order_to_ref, test index for each ref coord
out_order, is_empty = table.query(self.ref_out_inds_scalar)
assert is_empty.int().sum().item() == 0, "shouldn't happen"
self.out_order = out_order.cpu().numpy()
# inp_table = HashTable(out_inds.device, torch.int64, torch.int32, self.ref_out_inds.shape[0] * 2)
# inp_table.insert(self.ref_out_inds_scalar, torch.arange(0, self.ref_out_inds.shape[0], dtype=torch.int32, device=self.device))
# # out_order: ref index for each out coord
# out_order, is_empty = inp_table.query(self.out_inds_scalar)
self.voxels_np = voxels_np
self.indices_np = indices_np
self.check_bias = check_bias
self.check_act = check_act
self.subm = subm
if dtype == np.int8:
......@@ -128,6 +165,10 @@ class SparseConvTester:
self.output = np.random.randint(-2, 2, size=[
self.out_inds.shape[0], K
]).astype(dtype)
self.bias = np.random.randint(-2, 2, size=[
K
]).astype(dtype)
else:
self.inp = np.random.uniform(-1, 1, size=[
voxels_np.shape[0], C
......@@ -136,14 +177,25 @@ class SparseConvTester:
self.output = np.random.uniform(-1, 1, size=[
self.out_inds.shape[0], K
]).astype(dtype)
self.bias = np.random.uniform(-1, 1, size=[
K
]).astype(dtype)
self.weight_ref = self.weight.transpose(1, 2, 3, 0, 4)
self.weight_ref = np.ascontiguousarray(self.weight_ref).reshape(-1, K, C)
self.out_ref, self.din_ref, self.dw_ref = self._get_ref_output()
if check_bias:
self.out_ref += self.bias
# relu
if check_act:
self.out_ref = np.maximum(self.out_ref, 0)
self.dw_ref = np.ascontiguousarray(self.dw_ref.transpose(1, 0, 2).reshape(K, *self.ksize, C))
self.arch = tv.get_compute_capability()
def get_output_ref_spt(self):
    """Return the reference output wrapped as a ``SparseConvTensor``.

    The features are ``self.out_ref`` moved to the GPU; the indices,
    spatial shape and batch size come from ``self.ref_out_inds``,
    ``self.out_shape`` and ``self.bs``.
    """
    # Convert the NumPy reference result to a CUDA torch tensor first.
    ref_features = torch.from_numpy(self.out_ref).cuda()
    return SparseConvTensor(
        ref_features,
        self.ref_out_inds,
        self.out_shape,
        self.bs,
    )
def _get_ref_output(self):
output_ref = np.zeros_like(self.output, dtype=np.float32)
dinput_ref = np.zeros_like(self.inp, dtype=np.float32)
......@@ -165,13 +217,15 @@ class SparseConvTester:
np.float32) @ self.weight_ref[filter_offset].T.astype(
np.float32)
output_ref[o_inds] += cc
a = self.output[o_inds]
# we use random output as dout here
a = self.output[self.out_order][o_inds]
# NK @ KC
cc = a.astype(
np.float32) @ self.weight_ref[filter_offset].astype(
np.float32)
dinput_ref[i_inds] += cc
out_gather = self.output[o_inds] # [N, K]
# use random output and random inp as dout and inp
out_gather = self.output[self.out_order][o_inds] # [N, K]
inp_gather = self.inp[i_inds] # [N, C]
# KN @ NC
dw_res = out_gather.astype(
......@@ -225,7 +279,7 @@ def _test_impgemm_conv_cuda(subm: bool):
shapes = [[19, 18, 17]]
batchsizes = [1]
dtypes = [np.float32, np.float16]
dtypes = [np.int8]
# dtypes = [np.int8]
test_case = TestCase()
# in_channels = [32]
# out_channels = [32, 48, 64]
......@@ -245,6 +299,7 @@ def _test_impgemm_conv_cuda(subm: bool):
strides = [1, 2, 3]
paddings = [0, 1]
dilations = [1, 2]
algos = [
# ConvAlgo.MaskSplitImplicitGemm,
ConvAlgo.MaskImplicitGemm,
......@@ -261,11 +316,14 @@ def _test_impgemm_conv_cuda(subm: bool):
multipler = max(C, K) / multiple_base
multipler = max(multipler, 1.0)
# print(num_batch)
tester = SparseConvTester(algo, subm, shape, bs, dtype, num_batch, K, C, k, s, p, d)
tester = SparseConvTester(algo, subm, shape, bs, dtype, num_batch, K, C, k, s, p, d, check_bias=True, check_act=True)
bias = None
act = tv.gemm.Activation.None_
if tester.check_bias:
bias = tv.from_numpy(tester.bias).cuda()
atol, rtol = dtype_to_tol[dtype]
mask_width_to_mask_out_fwd: Dict[int, torch.Tensor] = {}
mask_width_to_mask_out_bwd: Dict[int, torch.Tensor] = {}
op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput]
spk = 1
for op_type in op_types:
......@@ -276,7 +334,11 @@ def _test_impgemm_conv_cuda(subm: bool):
NHWC.layout_type.value, NHWC.interleave, NHWC.interleave, NHWC.interleave, arch, op_type.value, -1, True, False)
else:
avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, op_type, -1)
if op_type == ConvOpType.kForward and tester.check_act:
act = tv.gemm.Activation.ReLU
else:
act = tv.gemm.Activation.None_
assert avail_desps
for desp in avail_desps:
if not subm:
if op_type == ConvOpType.kForward:
......@@ -292,7 +354,10 @@ def _test_impgemm_conv_cuda(subm: bool):
dtype=torch.int32,
device=tester.device)
mask_output_fwd = mask_width_to_mask_out_fwd[mask_width]
is_fwd = desp.op_type.value == ConvOpType.kForward.value
bias_cur = bias
if op_type != ConvOpType.kForward:
bias_cur = None
if subm:
if desp.op_type.value == ConvOpType.kForward.value:
indice_pairs = tester.pair_fwd
......@@ -303,9 +368,12 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_output = mask_output_fwd
# print([bin(x.item()) for x in masks])
for j in range(tester.num_split):
beta = 1 if j == 1 else 0
beta = 1 if j > 0 else 0
if bias_cur is not None:
beta = 1
if j > 0:
bias_cur = None
mask_filter = tester.masks[j].item()
reverse_mask = False
if desp.op_type.value == ConvOpType.kBackwardWeight.value:
mask_op = mask_output[j]
......@@ -338,6 +406,8 @@ def _test_impgemm_conv_cuda(subm: bool):
beta=beta,
verbose=False,
force_nvrtc=force_nvrtc,
bias=bias_cur if is_fwd and bias_cur is not None else tv.Tensor(),
act_type=act,
)
else:
CONV.run_with_tuned_result(
......@@ -356,6 +426,8 @@ def _test_impgemm_conv_cuda(subm: bool):
beta=beta,
verbose=False,
force_nvrtc=force_nvrtc,
bias=bias_cur if is_fwd else None,
act_type=act,
)
else:
......@@ -382,7 +454,12 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_output = mask_output_fwd
for j in range(tester.num_split):
beta = 1 if j == 1 else 0
# beta = 1 if j == 1 else 0
beta = 1 if j > 0 else 0
if bias_cur is not None:
beta = 1
if j > 0:
bias_cur = None
mask_filter = tester.masks[j].item()
reverse_mask = False
if desp.op_type.value == ConvOpType.kBackwardWeight.value:
......@@ -406,6 +483,9 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_width=mask_width,
beta=beta,
verbose=False,
force_nvrtc=force_nvrtc,
bias=bias if is_fwd and bias is not None else tv.Tensor(),
act_type=act,
)
else:
CONV.run_with_tuned_result(
......@@ -423,6 +503,9 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_width=mask_width,
beta=beta,
verbose=False,
force_nvrtc=force_nvrtc,
bias=bias if is_fwd else None,
act_type=act,
)
out_ref = tester.out_ref
......@@ -430,6 +513,7 @@ def _test_impgemm_conv_cuda(subm: bool):
dw_ref = tester.dw_ref
if op_type == ConvOpType.kForward:
out_my = output_tv.cpu().numpy()
out_my = out_my[tester.out_order]
if dtype != np.float16:
test_case.assertAllClose(out_ref, out_my, atol=atol, rtol=rtol)
else:
......@@ -437,7 +521,6 @@ def _test_impgemm_conv_cuda(subm: bool):
if (error_norm > 5):
print(f"{desp}, Error={error_norm}")
assert error_norm < 10 * multipler
# print(desp, )
else:
din_my = inp_tv.cpu().numpy()
if dtype != np.float16:
......@@ -446,7 +529,6 @@ def _test_impgemm_conv_cuda(subm: bool):
error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
assert error_norm < 10 * multipler, f"{desp}, {error_norm}, {k}, {s}, {p}, {d}"
inp_tv, weight_tv, output_tv = tester.get_operands(ConvOpType.kBackwardWeight)
for spk in [1, 4, 16, 64]:
for mask_width, mask_output in mask_width_to_mask_out_fwd.items():
if SPCONV_CPP_GEMM:
......@@ -554,7 +636,10 @@ def _test_native_conv_cuda(subm: bool):
for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid(
shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations, dtypes)):
tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype, 1500, K, C, k, s, p, d)
tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype, 1500, K, C, k, s, p, d, check_bias=True, check_act=True)
bias = None
if tester.check_bias:
bias = tv.from_numpy(tester.bias).cuda()
atol, rtol = dtype_to_tol[dtype]
multipler = max(C, K) / multiple_base
multipler = max(multipler, 1.0)
......@@ -580,7 +665,6 @@ def _test_native_conv_cuda(subm: bool):
inp_tv = torch_tensor_to_tv(inp_th)
weight_tv = torch_tensor_to_tv(weight_th)
output_tv = torch_tensor_to_tv(output_th)
if op_type == ConvOpType.kForward:
a = inp_tv
c = output_tv
......@@ -593,9 +677,11 @@ def _test_native_conv_cuda(subm: bool):
for desp in avail_desps:
if subm:
torch.mm(inp_th, weight_th[:, tester.kv // 2].T, out=output_th)
# output_th += bias_th
else:
output_tv.zero_()
inited = subm
# determine last valid subm indices, then apply
for i, nhot in enumerate(indice_pair_num_cpu):
if subm and i == kv_center:
continue
......@@ -643,8 +729,14 @@ def _test_native_conv_cuda(subm: bool):
hint=AlgoHint.Fowrard.value,
alpha=1.0,
beta=beta)
inited = True
if bias is not None and tester.check_act:
InferenceOps.bias_add_act_inplace(output_tv, bias, tv.gemm.Activation.ReLU, 0, 0)
else:
if bias is not None:
InferenceOps.bias_add_inplace(output_tv, bias, 0)
if tester.check_act:
InferenceOps.activation_inplace(output_tv, tv.gemm.Activation.ReLU, 0, 0)
out_my = output_tv.cpu().numpy()
if dtype != np.float16:
# error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
......@@ -807,7 +899,7 @@ def _test_native_conv_cuda(subm: bool):
# Entry point that runs the implicit-GEMM and native sparse-conv CUDA tests.
# Each helper takes a single `subm` flag.
# NOTE(review): this text comes from a diff view, so the repeated
# `_test_impgemm_conv_cuda(True)` call is probably one removed line plus one
# added line - confirm against the actual file.
def test_all_algo_unit():
# for i in range(5):
_test_impgemm_conv_cuda(True)
_test_impgemm_conv_cuda(False)
_test_impgemm_conv_cuda(True)
_test_native_conv_cuda(True)
_test_native_conv_cuda(False)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment