Commit d0bfb3a3 authored by yan.yan's avatar yan.yan
Browse files

add fused bias/act

parent 2b195e43
...@@ -52,7 +52,7 @@ REQUIRES_PYTHON = '>=3.6' ...@@ -52,7 +52,7 @@ REQUIRES_PYTHON = '>=3.6'
VERSION = None VERSION = None
# What packages are required for this module to be executed? # What packages are required for this module to be executed?
REQUIRED = ["pccm>=0.2.21", "pybind11>=2.6.0", "fire", "numpy", *deps] REQUIRED = ["pccm>=0.3.5", "pybind11>=2.6.0", "fire", "numpy", *deps]
# What packages are optional? # What packages are optional?
EXTRAS = { EXTRAS = {
...@@ -162,6 +162,7 @@ if disable_jit is not None and disable_jit == "1": ...@@ -162,6 +162,7 @@ if disable_jit is not None and disable_jit == "1":
from spconv.csrc.sparse.alloc import ExternalAllocator from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
from spconv.csrc.sparse.inference import InferenceOps
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS) cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS) convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
...@@ -192,7 +193,7 @@ if disable_jit is not None and disable_jit == "1": ...@@ -192,7 +193,7 @@ if disable_jit is not None and disable_jit == "1":
cus = [gemmtuner, convtuner, cus = [gemmtuner, convtuner,
convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(), convops, SpconvOps(), BoxOps(), HashTable(), CompileInfo(),
ExternalAllocator(), ExternalAllocator(),
ExternalSpconvMatmul()] ExternalSpconvMatmul(), InferenceOps()]
if not CUMM_CPU_ONLY_BUILD: if not CUMM_CPU_ONLY_BUILD:
cus.extend([cu, convcu]) cus.extend([cu, convcu])
ext_modules: List[Extension] = [ ext_modules: List[Extension] = [
......
...@@ -606,7 +606,11 @@ class SimpleGemm: ...@@ -606,7 +606,11 @@ class SimpleGemm:
gather_data: tv.Tensor = tv.Tensor(), gather_data: tv.Tensor = tv.Tensor(),
workspace: tv.Tensor = tv.Tensor(), workspace: tv.Tensor = tv.Tensor(),
timer: CUDAKernelTimer = CUDAKernelTimer(False), timer: CUDAKernelTimer = CUDAKernelTimer(False),
force_nvrtc: bool = False): force_nvrtc: bool = False,
bias: Optional[tv.Tensor] = None,
act_alpha: float = 0.0,
act_beta: float = 0.0,
act_type: tv.gemm.Activation = tv.gemm.Activation.None_):
m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape, trans_a, m, n, k = GemmMainUnitTest.extract_mnk(a.shape, b.shape, trans_a,
trans_b, trans_c, trans_b, trans_c,
shuffle_type.value, shuffle_type.value,
...@@ -630,6 +634,8 @@ class SimpleGemm: ...@@ -630,6 +634,8 @@ class SimpleGemm:
params.a = a params.a = a
params.b = b params.b = b
params.c = c params.c = c
if bias is not None:
params.d = bias
params.a_inds = a_inds params.a_inds = a_inds
params.b_inds = b_inds params.b_inds = b_inds
params.c_inds = c_inds params.c_inds = c_inds
...@@ -638,6 +644,9 @@ class SimpleGemm: ...@@ -638,6 +644,9 @@ class SimpleGemm:
params.stream = stream params.stream = stream
params.alpha = alpha params.alpha = alpha
params.beta = beta params.beta = beta
params.act_alpha = act_alpha
params.act_beta = act_beta
params.act_type = act_type
params.workspace = workspace params.workspace = workspace
# gather = 0 # gather = 0
# if profile_res.external_gather and not gather_data.empty(): # if profile_res.external_gather and not gather_data.empty():
...@@ -973,7 +982,11 @@ class SimpleConv: ...@@ -973,7 +982,11 @@ class SimpleConv:
workspace: tv.Tensor = tv.Tensor(), workspace: tv.Tensor = tv.Tensor(),
verbose: bool = False, verbose: bool = False,
timer: CUDAKernelTimer = CUDAKernelTimer(False), timer: CUDAKernelTimer = CUDAKernelTimer(False),
force_nvrtc: bool = False): force_nvrtc: bool = False,
bias: Optional[tv.Tensor] = None,
act_alpha: float = 0.0,
act_beta: float = 0.0,
act_type: tv.gemm.Activation = tv.gemm.Activation.None_):
channel_k = output.dim(1) channel_k = output.dim(1)
channel_c = inp.dim(1) channel_c = inp.dim(1)
# GemmMainUnitTest.stream_synchronize(stream) # GemmMainUnitTest.stream_synchronize(stream)
...@@ -989,7 +1002,7 @@ class SimpleConv: ...@@ -989,7 +1002,7 @@ class SimpleConv:
params = ConvParams(NDIM_DONT_CARE, ConvOpTypeCpp(op_type_value)) params = ConvParams(NDIM_DONT_CARE, ConvOpTypeCpp(op_type_value))
is_not_static = str( is_not_static = str(
algo_desp) not in self.prebuilt_desp_names algo_desp) not in self.prebuilt_desp_names
if algo_desp.is_nvrtc and (is_not_static or force_nvrtc): if force_nvrtc or (algo_desp.is_nvrtc and is_not_static):
params.nvrtc_params = self._cached_get_nvrtc_params( params.nvrtc_params = self._cached_get_nvrtc_params(
algo_desp, profile_res.arch) algo_desp, profile_res.arch)
params.conv_algo_desp = profile_res.algo_desp params.conv_algo_desp = profile_res.algo_desp
...@@ -1001,6 +1014,9 @@ class SimpleConv: ...@@ -1001,6 +1014,9 @@ class SimpleConv:
params.split_k_slices = split_k_slices params.split_k_slices = split_k_slices
params.alpha = alpha params.alpha = alpha
params.beta = beta params.beta = beta
params.act_alpha = act_alpha
params.act_beta = act_beta
params.act_type = act_type
params.stream = stream params.stream = stream
params.mask_argsort = mask_argsort params.mask_argsort = mask_argsort
params.indices = indices params.indices = indices
...@@ -1011,6 +1027,8 @@ class SimpleConv: ...@@ -1011,6 +1027,8 @@ class SimpleConv:
params.mask_filter = mask_filter params.mask_filter = mask_filter
params.mask_output = mask_output params.mask_output = mask_output
params.reverse_mask = reverse_mask params.reverse_mask = reverse_mask
if bias is not None:
params.bias = bias
if timer.enable: if timer.enable:
assert timer._timer is not None assert timer._timer is not None
params.timer = timer._timer params.timer = timer._timer
......
...@@ -36,6 +36,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -36,6 +36,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul from spconv.csrc.sparse.convops import SimpleExternalSpconvMatmul
from spconv.csrc.sparse.inference import InferenceOps
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle)) all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
...@@ -63,6 +64,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -63,6 +64,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
ExternalAllocator(), ExternalAllocator(),
ExternalSpconvMatmul(), ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(), # for debug, won't be included in release SimpleExternalSpconvMatmul(), # for debug, won't be included in release
InferenceOps(),
] ]
pccm.builder.build_pybind(cus, pccm.builder.build_pybind(cus,
PACKAGE_ROOT / "core_cc", PACKAGE_ROOT / "core_cc",
......
...@@ -100,7 +100,9 @@ class AllocKeys: ...@@ -100,7 +100,9 @@ class AllocKeys:
SPCONV_DEBUG_WEIGHT = False SPCONV_DEBUG_WEIGHT = False
SPCONV_CPP_INDICE_PAIRS = False SPCONV_CPP_INDICE_PAIRS = True
SPCONV_USE_DIRECT_TABLE = True
# currently use cpp pair gen is slightly slower than python, I don't know why. # currently use cpp pair gen is slightly slower than python, I don't know why.
SPCONV_CPP_INDICE_PAIRS_IGEMM = os.getenv("SPCONV_CPP_INDICE_PAIRS_IGEMM", "0") == "1" SPCONV_CPP_INDICE_PAIRS_IGEMM = os.getenv("SPCONV_CPP_INDICE_PAIRS_IGEMM", "0") == "1"
......
...@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor ...@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from cumm.tensorview import CUDAKernelTimer from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview.gemm import Activation
class ConvTunerSimple: class ConvTunerSimple:
def __init__(self, desps: List[ConvAlgoDesp]) -> None: def __init__(self, desps: List[ConvAlgoDesp]) -> None:
""" """
...@@ -88,7 +89,7 @@ class ConvTunerSimple: ...@@ -88,7 +89,7 @@ class ConvTunerSimple:
mask_width: mask_width:
""" """
... ...
def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False) -> None: def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> None:
""" """
Args: Args:
profile_res: profile_res:
...@@ -110,6 +111,10 @@ class ConvTunerSimple: ...@@ -110,6 +111,10 @@ class ConvTunerSimple:
verbose: verbose:
timer: timer:
force_nvrtc: force_nvrtc:
bias:
act_alpha:
act_beta:
act_type:
""" """
... ...
def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int: def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int:
......
...@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor ...@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from cumm.tensorview import CUDAKernelTimer from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview.gemm import Activation
class GemmTunerSimple: class GemmTunerSimple:
def __init__(self, desps: List[GemmAlgoDesp]) -> None: def __init__(self, desps: List[GemmAlgoDesp]) -> None:
""" """
...@@ -81,7 +82,7 @@ class GemmTunerSimple: ...@@ -81,7 +82,7 @@ class GemmTunerSimple:
hint: hint:
""" """
... ...
def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False) -> None: def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> None:
""" """
Args: Args:
profile_res: profile_res:
...@@ -103,5 +104,9 @@ class GemmTunerSimple: ...@@ -103,5 +104,9 @@ class GemmTunerSimple:
workspace: workspace:
timer: timer:
force_nvrtc: force_nvrtc:
bias:
act_alpha:
act_beta:
act_type:
""" """
... ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor from cumm.tensorview import Tensor
from cumm.tensorview.gemm import Activation
from cumm.tensorview import CUDAKernelTimer from cumm.tensorview import CUDAKernelTimer
class ConvGemmOps: class ConvGemmOps:
@staticmethod @staticmethod
...@@ -11,7 +12,7 @@ class ConvGemmOps: ...@@ -11,7 +12,7 @@ class ConvGemmOps:
""" """
... ...
@staticmethod @staticmethod
def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None: def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> None:
""" """
1. this function need to take a out features 1. this function need to take a out features
that from subm first mm. that from subm first mm.
...@@ -32,6 +33,10 @@ class ConvGemmOps: ...@@ -32,6 +33,10 @@ class ConvGemmOps:
subm: subm:
algo: algo:
stream_int: stream_int:
bias:
act_alpha:
act_beta:
act_type:
""" """
... ...
@staticmethod @staticmethod
...@@ -56,7 +61,7 @@ class ConvGemmOps: ...@@ -56,7 +61,7 @@ class ConvGemmOps:
""" """
... ...
@staticmethod @staticmethod
def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> Tuple[int, Any]: def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_) -> Tuple[int, Any]:
""" """
Args: Args:
allocator: allocator:
...@@ -75,6 +80,10 @@ class ConvGemmOps: ...@@ -75,6 +80,10 @@ class ConvGemmOps:
timer: timer:
auto_fp32_accum: auto_fp32_accum:
fp32_accum: fp32_accum:
bias:
act_alpha:
act_beta:
act_type:
""" """
... ...
@staticmethod @staticmethod
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import Activation
class InferenceOps:
    """Type stub for the compiled ``InferenceOps`` bindings.

    The signatures mirror the pccm generator class of the same name in
    ``spconv.csrc.sparse.inference``; every function updates ``out`` in place
    and returns nothing.
    """
    @staticmethod
    def bias_add_act_inplace(out: Tensor, bias: Tensor, act_type: Activation = Activation.None_, alpha: float = 0.0, beta: float = 0.0, stream: int = 0) -> None:
        """
        Add ``bias`` to every row of ``out`` in place, then apply ``act_type``.

        Args:
            out: tensor updated in place. The generator reads ``out.dim(0)`` as
                the row count and ``out.dim(1)`` as the feature count, and
                dispatches on its dtype (float, double, half, bfloat16).
            bias: per-feature bias; the generator asserts
                ``bias.dim(0) == out.dim(1)``.
            act_type: activation selector. ``Activation.None_`` runs the
                bias-only kernel; otherwise the fused bias+activation kernel
                is launched.
            alpha: multiplier applied to negative values in the kernel's
                LeakyReLU branch.
            beta: forwarded to the kernel; the visible kernel code does not
                read it.
            stream: CUDA stream handle as an integer (cast to
                ``cudaStream_t`` by the generated code).
        """
        ...
    @staticmethod
    def bias_add_inplace(out: Tensor, bias: Tensor, stream: int = 0) -> None:
        """
        Add ``bias`` to every row of ``out`` in place, without activation.

        The generated code forwards to ``bias_add_act_inplace`` with the
        "none" activation and zero ``alpha`` / ``beta``.

        Args:
            out: tensor updated in place (rows x features).
            bias: per-feature bias; length must equal ``out.dim(1)``.
            stream: CUDA stream handle as an integer.
        """
        ...
    @staticmethod
    def activation_inplace(out: Tensor, act_type: Activation, alpha: float, beta: float, stream: int = 0) -> None:
        """
        Apply ``act_type`` to every element of ``out`` in place.

        Args:
            out: tensor updated in place; the generated code iterates over
                ``out.size()`` elements as a flat buffer.
            act_type: activation selector passed to the kernel.
            alpha: multiplier applied to negative values in the kernel's
                LeakyReLU branch.
            beta: forwarded to the kernel; the visible kernel code does not
                read it.
            stream: CUDA stream handle as an integer.
        """
        ...
...@@ -14,7 +14,7 @@ from spconv.csrc.sparse.gather import GatherCPU ...@@ -14,7 +14,7 @@ from spconv.csrc.sparse.gather import GatherCPU
from .alloc import ExternalAllocator from .alloc import ExternalAllocator
from cumm.common import CompileInfo from cumm.common import CompileInfo
from .inference import InferenceOps
class ExternalSpconvMatmul(pccm.Class): class ExternalSpconvMatmul(pccm.Class):
"""a helper class to warp matmul operations """a helper class to warp matmul operations
...@@ -834,6 +834,12 @@ class GemmTunerSimple(pccm.ParameterizedClass): ...@@ -834,6 +834,12 @@ class GemmTunerSimple(pccm.ParameterizedClass):
code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)", code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)",
"cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(False)") "cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(False)")
code.arg("force_nvrtc", f"bool", "false") code.arg("force_nvrtc", f"bool", "false")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
if CUMM_CPU_ONLY_BUILD: if CUMM_CPU_ONLY_BUILD:
code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")") code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
return code return code
...@@ -847,12 +853,13 @@ class GemmTunerSimple(pccm.ParameterizedClass): ...@@ -847,12 +853,13 @@ class GemmTunerSimple(pccm.ParameterizedClass):
tv::gemm::GemmParams params; tv::gemm::GemmParams params;
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end(); bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
if (desp.is_nvrtc && (desp_is_static || force_nvrtc)){{ if (force_nvrtc || (desp.is_nvrtc && desp_is_static)){{
params.nvrtc_params = cached_get_nvrtc_params(desp, profile_res.arch, stream_int); params.nvrtc_params = cached_get_nvrtc_params(desp, profile_res.arch, stream_int);
}} }}
params.a = a; params.a = a;
params.b = b; params.b = b;
params.c = c; params.c = c;
params.d = bias;
params.a_inds = a_inds; params.a_inds = a_inds;
params.b_inds = b_inds; params.b_inds = b_inds;
params.c_inds = c_inds; params.c_inds = c_inds;
...@@ -861,6 +868,10 @@ class GemmTunerSimple(pccm.ParameterizedClass): ...@@ -861,6 +868,10 @@ class GemmTunerSimple(pccm.ParameterizedClass):
params.stream = stream_int; params.stream = stream_int;
params.alpha = alpha; params.alpha = alpha;
params.beta = beta; params.beta = beta;
params.act_alpha = act_alpha;
params.act_beta = act_beta;
params.act_type = act_type;
params.workspace = workspace; params.workspace = workspace;
GemmMain::matmul2(params); GemmMain::matmul2(params);
""") """)
...@@ -1257,15 +1268,18 @@ class ConvTunerSimple(pccm.ParameterizedClass): ...@@ -1257,15 +1268,18 @@ class ConvTunerSimple(pccm.ParameterizedClass):
code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)", code.arg("timer", "tv::CUDAKernelTimer", "tv::CUDAKernelTimer(false)",
"cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(false)") "cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(false)")
code.arg("force_nvrtc", f"bool", "false") code.arg("force_nvrtc", f"bool", "false")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
if CUMM_CPU_ONLY_BUILD: if CUMM_CPU_ONLY_BUILD:
code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")") code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
return code return code
code.raw(f""" code.raw(f"""
auto desp = profile_res.algo_desp; auto desp = profile_res.algo_desp;
if (force_nvrtc){{
desp.is_nvrtc = true;
}}
int split_k_slices = 1; int split_k_slices = 1;
if (profile_res.splitk > 1){{ if (profile_res.splitk > 1){{
split_k_slices = profile_res.splitk; split_k_slices = profile_res.splitk;
...@@ -1276,7 +1290,7 @@ class ConvTunerSimple(pccm.ParameterizedClass): ...@@ -1276,7 +1290,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
auto arch = profile_res.arch; auto arch = profile_res.arch;
tv::gemm::ConvParams params({NDIM_DONT_CARE}, op_type_cpp, timer); tv::gemm::ConvParams params({NDIM_DONT_CARE}, op_type_cpp, timer);
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end(); bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
if (desp.is_nvrtc && (desp_is_static || force_nvrtc)){{ if (force_nvrtc || (desp.is_nvrtc && desp_is_static)){{
params.nvrtc_params = cached_get_nvrtc_params(desp, arch, stream_int); params.nvrtc_params = cached_get_nvrtc_params(desp, arch, stream_int);
}} }}
params.conv_algo_desp = desp; params.conv_algo_desp = desp;
...@@ -1284,10 +1298,15 @@ class ConvTunerSimple(pccm.ParameterizedClass): ...@@ -1284,10 +1298,15 @@ class ConvTunerSimple(pccm.ParameterizedClass):
params.weight = weight.view(channel_k, -1, channel_c); params.weight = weight.view(channel_k, -1, channel_c);
params.output = output; params.output = output;
params.verbose = verbose; params.verbose = verbose;
params.bias = bias;
params.split_k_slices = split_k_slices; params.split_k_slices = split_k_slices;
params.alpha = alpha; params.alpha = alpha;
params.beta = beta; params.beta = beta;
params.act_alpha = act_alpha;
params.act_beta = act_beta;
params.act_type = act_type;
params.stream = stream_int; params.stream = stream_int;
params.mask_argsort = mask_argsort; params.mask_argsort = mask_argsort;
params.indices = indices; params.indices = indices;
...@@ -1336,6 +1355,7 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1336,6 +1355,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
GemmTuneResult, GemmTuneResult,
ConvTuneResult, ConvTuneResult,
ExternalSpconvMatmul, ExternalSpconvMatmul,
InferenceOps,
) )
self.add_param_class("gemm", gemm_tuner, "GemmTuner") self.add_param_class("gemm", gemm_tuner, "GemmTuner")
self.add_param_class("conv", conv_tuner, "ConvTuner") self.add_param_class("conv", conv_tuner, "ConvTuner")
...@@ -1384,11 +1404,18 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1384,11 +1404,18 @@ class ConvGemmOps(pccm.ParameterizedClass):
code.arg("subm", "bool", "false") code.arg("subm", "bool", "false")
code.arg("algo", "int", f"{ConvAlgo.Native.value}") code.arg("algo", "int", f"{ConvAlgo.Native.value}")
code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int") code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
code.raw(f""" code.raw(f"""
int kv_dim, out_channel, kv; int kv_dim, out_channel, kv;
std::vector<int64_t> filter_shape_per_kv; std::vector<int64_t> filter_shape_per_kv;
bool is_KC_not_CK; bool is_KC_not_CK;
bool has_bias = !bias.empty();
bool has_act = act_type != tv::gemm::Activation::kNone;
if (!all_w_is_krsc){{ if (!all_w_is_krsc){{
kv_dim = 0; kv_dim = 0;
is_KC_not_CK = !filter_hwio; is_KC_not_CK = !filter_hwio;
...@@ -1419,10 +1446,22 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1419,10 +1446,22 @@ class ConvGemmOps(pccm.ParameterizedClass):
out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)}, out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int); {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}} }}
if (has_act || has_bias){{
TV_ASSERT_RT_ERR(!features.is_cpu(), "bias and act don't support cpu.");
}}
if (kv == 1 && subm){{ if (kv == 1 && subm){{
if (has_bias && has_act){{
InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
}}else{{
if (has_bias){{
InferenceOps::bias_add_inplace(out_features, bias, stream_int);
}}
if (has_act){{
InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
}}
}}
return; return;
}} }}
auto indice_pair_num_cpu = indice_pair_num.cpu(); auto indice_pair_num_cpu = indice_pair_num.cpu();
auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>(); auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
int maxnhot = 0; int maxnhot = 0;
...@@ -1571,6 +1610,16 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1571,6 +1610,16 @@ class ConvGemmOps(pccm.ParameterizedClass):
beta); beta);
inited = true; inited = true;
}} }}
if (has_bias && has_act){{
InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
}}else{{
if (has_bias){{
InferenceOps::bias_add_inplace(out_features, bias, stream_int);
}}
if (has_act){{
InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
}}
}}
""") """)
return code return code
...@@ -1913,11 +1962,21 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1913,11 +1962,21 @@ class ConvGemmOps(pccm.ParameterizedClass):
code.arg("auto_fp32_accum", "bool", "true") code.arg("auto_fp32_accum", "bool", "true")
code.arg("fp32_accum", "bool", "false") code.arg("fp32_accum", "bool", "false")
code.arg("bias", "tv::Tensor", "tv::Tensor()",
"cumm.tensorview.Tensor = Tensor()")
code.arg("act_alpha", f"float", "0.0")
code.arg("act_beta", f"float", "0.0")
code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
if CUMM_CPU_ONLY_BUILD: if CUMM_CPU_ONLY_BUILD:
code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")") code.raw(f"TV_THROW_RT_ERR(\"not implemented for cpu!!!\")")
return code.ret("int") return code.ret("int")
code.raw(f""" code.raw(f"""
if (!bias.empty() || act_type != tv::gemm::Activation::kNone){{
TV_ASSERT_RT_ERR(pair_mask_fwd_splits.size() == 1, "SplitGemm don't support fused bias/act for now.");
}}
uint32_t* mask_ptr = masks.data_ptr<uint32_t>(); uint32_t* mask_ptr = masks.data_ptr<uint32_t>();
int num_mask = masks.dim(0); int num_mask = masks.dim(0);
int out_channel = filters.dim(0); int out_channel = filters.dim(0);
...@@ -1989,6 +2048,7 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1989,6 +2048,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
for (int j = 0; j < num_split; ++j){{ for (int j = 0; j < num_split; ++j){{
float beta = j == 0 ? 0 : 1; float beta = j == 0 ? 0 : 1;
conv_tuner.run_with_tuned_result( conv_tuner.run_with_tuned_result(
tune_res, tune_res,
kForwardInt, kForwardInt,
...@@ -2006,7 +2066,12 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -2006,7 +2066,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
stream_int, stream_int,
tv::Tensor(), // workspace tv::Tensor(), // workspace
false, // verbose false, // verbose
timer); timer,
false,
bias,
act_alpha,
act_beta,
act_type);
}} }}
// auto end_ev = tv::CUDAEvent(); // auto end_ev = tv::CUDAEvent();
// end_ev.record(stream_int); // end_ev.record(stream_int);
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pccm
from cumm.common import TensorView, GemmDTypes, TensorViewKernel, ThrustLib, GemmBasic
from spconv.csrc.sparse.cpu_core import OMPLib
from cumm.constants import CUMM_CPU_ONLY_BUILD
class InferenceOpsKernel(pccm.ParameterizedClass):
    """pccm generator for the CUDA ``__global__`` kernels used by InferenceOps.

    Each method returns a ``pccm.FunctionCode`` describing one templated
    kernel (template parameter ``T`` is the element type). All kernels update
    ``out_features`` in place.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorViewKernel, GemmBasic)

    @pccm.cuda.cuda_global_function
    def bias_add_inplace_kernel(self):
        """Generate the kernel computing ``out[i, j] += bias[j]``.

        Kernel arguments: ``out_features`` (``size * num_features`` elements,
        row stride ``num_features``), ``bias`` (``num_features`` elements),
        ``size`` (row count) and ``num_features``. Rows are walked with
        ``tv::KernelLoopY`` and features with ``tv::KernelLoopX``.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", f"T*")
        code.arg("bias", f"const T*")
        code.arg("size", "int")
        code.arg("num_features", "int")
        code.raw(f"""
for (int i : tv::KernelLoopY<int>(size)) {{
auto out_ptr = out_features + i * num_features;
for (int j : tv::KernelLoopX<int>(num_features)) {{
out_ptr[j] = bias[j] + out_ptr[j];
}}
}}
""")
        return code

    @pccm.cuda.cuda_global_function
    def bias_add_act_inplace_kernel(self):
        """Generate the fused kernel ``out[i, j] = act(out[i, j] + bias[j])``.

        ``act_type`` selects the activation: ``kNone`` leaves the biased value
        unchanged, ``kReLU`` clamps negatives to zero and ``kLeakyReLU``
        multiplies negatives by ``alpha``; any other value leaves the biased
        value unchanged. ``beta`` is accepted but not read by this kernel.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", f"T*")
        code.arg("bias", f"const T*")
        code.arg("act_type", f"tv::gemm::Activation")
        code.arg("alpha", f"T")
        code.arg("beta", f"T")
        code.arg("size", "int")
        code.arg("num_features", "int")
        # Every case ends in an explicit `break;` so one activation can never
        # fall through into the next one.
        code.raw(f"""
for (int i : tv::KernelLoopY<int>(size)) {{
auto out_ptr = out_features + i * num_features;
for (int j : tv::KernelLoopX<int>(num_features)) {{
T o = out_ptr[j] + bias[j];
switch (act_type){{
case tv::gemm::Activation::kNone:
break;
case tv::gemm::Activation::kReLU:{{
o = o >= T(0) ? o : T(0);
break;
}}
case tv::gemm::Activation::kLeakyReLU:{{
o = o >= T(0) ? o : o * alpha;
break;
}}
default: ;
}}
out_ptr[j] = o;
}}
}}
""")
        return code

    @pccm.cuda.cuda_global_function
    def activation_inplace_kernel(self):
        """Generate the kernel applying ``act_type`` to a flat buffer in place.

        ``size`` is the number of elements in ``out_features``; elements are
        walked with a single ``tv::KernelLoopX``. ``kNone`` and unknown
        activation values leave the element untouched, ``kReLU`` clamps
        negatives to zero and ``kLeakyReLU`` multiplies negatives by
        ``alpha``. ``beta`` is accepted but not read by this kernel.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", f"T*")
        code.arg("act_type", f"tv::gemm::Activation")
        code.arg("alpha", f"T")
        code.arg("beta", f"T")
        code.arg("size", "int")
        # The `break;` after kReLU matters here: without it the kLeakyReLU
        # case re-computes from the original `o` and overwrites the clamped
        # value with `o * alpha`.
        code.raw(f"""
for (int i : tv::KernelLoopX<int>(size)) {{
T o = out_features[i];
switch (act_type){{
case tv::gemm::Activation::kNone:
break;
case tv::gemm::Activation::kReLU:{{
out_features[i] = o >= T(0) ? o : T(0);
break;
}}
case tv::gemm::Activation::kLeakyReLU:{{
out_features[i] = o >= T(0) ? o : o * alpha;
break;
}}
default: ;
}}
}}
""")
        return code
class InferenceOps(pccm.Class):
    """pccm generator for the host-side, pybind-exposed in-place bias/activation ops.

    Produces three static functions (``bias_add_act_inplace``,
    ``bias_add_inplace``, ``activation_inplace``) that launch the kernels
    generated by ``InferenceOpsKernel``.
    """
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        # Kept on the instance so each function can attach it under the
        # C++ alias "ker" via code.add_param_class.
        self.kernel = InferenceOpsKernel()
        # presumably the header that declares tv::gemm::Activation — TODO confirm
        self.add_include("tensorview/gemm/core/constants.h")

    # Pick the function decorator once at class-definition time: the plain
    # static-function decorator for CPU-only builds, the CUDA one otherwise.
    if CUMM_CPU_ONLY_BUILD:
        _DECORATOR = pccm.static_function
    else:
        _DECORATOR = pccm.cuda.static_function

    @pccm.pybind.mark
    @_DECORATOR
    def bias_add_act_inplace(self):
        """Generate ``bias_add_act_inplace(out, bias, act_type, alpha, beta, stream)``.

        The generated function asserts ``bias.dim(0) == out.dim(1)``,
        dispatches on ``out.dtype()`` (float, double, half, bfloat16) and
        launches the bias-only kernel when ``act_type`` is ``kNone`` or the
        fused bias+activation kernel otherwise. In a CPU-only build the
        generated body only throws.
        """
        code = pccm.FunctionCode()
        # The order of code.arg calls is the order of the generated C++ parameters.
        code.arg("out", "tv::Tensor")
        code.arg("bias", "tv::Tensor")
        code.arg("act_type", f"tv::gemm::Activation", "tv::gemm::Activation::kNone", "cumm.tensorview.gemm.Activation = Activation.None_")
        code.arg("alpha", f"float", "0.0")
        code.arg("beta", f"float", "0.0")
        code.arg("stream", "std::uintptr_t", "0")
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
TV_THROW_RT_ERR("this function don't support cpu only build.")
""")
            return code
        code.add_param_class("ker", self.kernel)
        # Launch shape: the x block dimension (NumFeatures) is chosen from
        # {512, ..., 16} by comparing against out.dim(1) (falling back to 16
        # when nothing matches); the y block dimension is 512 / NumFeatures so
        # a block always has 512 threads.
        # NOTE(review): the grid y dimension is div_up(nhot, Num0), which grows
        # with the row count — confirm it stays within CUDA's gridDim.y limit
        # for large inputs, or that tv::cuda::Launch guards against it.
        code.raw(f"""
auto nhot = out.dim(0);
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
TV_ASSERT_RT_ERR(bias.dim(0) == out.dim(1), "error");
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
int NumFeatures = TV_DECLTYPE(V)::value;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
int NumFeatures = 16;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
if (act_type == tv::gemm::Activation::kNone){{
launcher(ker::bias_add_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
nhot, out.dim(1));
}}else{{
launcher(ker::bias_add_act_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
act_type, T(alpha), T(beta), nhot, out.dim(1));
}}
}});
""")
        return code

    @pccm.pybind.mark
    @_DECORATOR
    def bias_add_inplace(self):
        """Generate ``bias_add_inplace(out, bias, stream)``.

        The generated function simply forwards to ``bias_add_act_inplace``
        with ``kNone`` and zero ``alpha`` / ``beta``, so it needs no CPU-only
        branch of its own.
        """
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("bias", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
return bias_add_act_inplace(out, bias, tv::gemm::Activation::kNone, 0, 0, stream);
""")
        return code

    @pccm.pybind.mark
    @_DECORATOR
    def activation_inplace(self):
        """Generate ``activation_inplace(out, act_type, alpha, beta, stream)``.

        The generated function treats ``out`` as a flat buffer of
        ``out.size()`` elements, dispatches on its dtype (float, double, half,
        bfloat16) and launches ``activation_inplace_kernel``. In a CPU-only
        build the generated body only throws.
        """
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("act_type", f"tv::gemm::Activation")
        code.arg("alpha", f"float")
        code.arg("beta", f"float")
        code.arg("stream", "std::uintptr_t", "0")
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
TV_THROW_RT_ERR("this function don't support cpu only build.")
""")
            return code
        code.add_param_class("ker", self.kernel)
        # Here nhot is the total element count (out.size()), not the row count.
        code.raw(f"""
auto nhot = out.size();
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::cuda::Launch launcher = tv::cuda::Launch(nhot, cudastream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
launcher(ker::activation_inplace_kernel<T>, out.data_ptr<T>(), act_type, T(alpha), T(beta),
nhot);
}});
""")
        return code
...@@ -17,6 +17,7 @@ from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple, ...@@ -17,6 +17,7 @@ from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
from spconv.csrc.utils import BoxOps from spconv.csrc.utils import BoxOps
from cumm.gemm.algospec.core import (GemmAlgo, ShuffleStrideType) from cumm.gemm.algospec.core import (GemmAlgo, ShuffleStrideType)
from cumm.conv.bases import ConvLayout, ConvLayoutType, ConvOpType from cumm.conv.bases import ConvLayout, ConvLayoutType, ConvOpType
from spconv.csrc.sparse.inference import InferenceOps
def main(include: str, def main(include: str,
...@@ -60,6 +61,7 @@ def main(include: str, ...@@ -60,6 +61,7 @@ def main(include: str,
ExternalSpconvMatmul(), ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(), SimpleExternalSpconvMatmul(),
StaticAllocator(), StaticAllocator(),
InferenceOps(),
] ]
gen_cmake(libname, cus, include, src, namespace_prefix=prefix) gen_cmake(libname, cus, include, src, namespace_prefix=prefix)
......
...@@ -162,6 +162,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -162,6 +162,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
assert len(spatial_shape) == ndim, "spatial shape must equal to ndim" assert len(spatial_shape) == ndim, "spatial shape must equal to ndim"
assert indices.dtype == torch.int32, "only support int32" assert indices.dtype == torch.int32, "only support int32"
assert batch_size > 0 assert batch_size > 0
# assert features.shape[0] == indices.shape[0]
self._features = features self._features = features
self.indices = indices self.indices = indices
self.spatial_shape = [int(v) for v in spatial_shape] self.spatial_shape = [int(v) for v in spatial_shape]
...@@ -197,6 +198,9 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -197,6 +198,9 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
return new_spt return new_spt
def minus(self):
    # Build the negated features first, then hand them to replace_feature,
    # whose result is returned unchanged.
    negated = -self.features
    return self.replace_feature(negated)
@property @property
def features(self): def features(self):
return self._features return self._features
......
...@@ -41,7 +41,7 @@ else: ...@@ -41,7 +41,7 @@ else:
GEMM_CPP = None GEMM_CPP = None
CONV_CPP = None CONV_CPP = None
import time import time
from spconv.constants import FILTER_HWIO, ALL_WEIGHT_IS_KRSC, AllocKeys from spconv.constants import FILTER_HWIO, ALL_WEIGHT_IS_KRSC, AllocKeys, SPCONV_USE_DIRECT_TABLE
from cumm.gemm import codeops from cumm.gemm import codeops
from spconv.tools import CUDAKernelTimer from spconv.tools import CUDAKernelTimer
...@@ -101,8 +101,12 @@ class _HashData: ...@@ -101,8 +101,12 @@ class _HashData:
dtype=torch.int32, dtype=torch.int32,
device=device) device=device)
hashdata_tv = torch_tensor_to_tv(self.hashdata) hashdata_tv = torch_tensor_to_tv(self.hashdata)
self.hashdata_k_tv = hashdata_tv[0] if num == 0:
self.hashdata_v_tv = hashdata_tv[1] self.hashdata_k_tv = tv.Tensor()
self.hashdata_v_tv = tv.Tensor()
else:
self.hashdata_k_tv = hashdata_tv[0]
self.hashdata_v_tv = hashdata_tv[1]
def get_indice_pairs(indices: torch.Tensor, def get_indice_pairs(indices: torch.Tensor,
...@@ -315,7 +319,7 @@ def get_indice_pairs_implicit_gemm( ...@@ -315,7 +319,7 @@ def get_indice_pairs_implicit_gemm(
alloc: Optional[ThrustSortAllocator] = None, alloc: Optional[ThrustSortAllocator] = None,
timer: CUDAKernelTimer = CUDAKernelTimer(False), timer: CUDAKernelTimer = CUDAKernelTimer(False),
num_out_act_bound: int = -1, num_out_act_bound: int = -1,
direct_table: bool = True): direct_table: bool = SPCONV_USE_DIRECT_TABLE):
""" """
Why return tuple? because pytorch seems don't support custom object in autograd. Why return tuple? because pytorch seems don't support custom object in autograd.
return: ( return: (
...@@ -535,7 +539,6 @@ def get_indice_pairs_implicit_gemm( ...@@ -535,7 +539,6 @@ def get_indice_pairs_implicit_gemm(
indices.shape[0], ksize, stride, padding, dilation) indices.shape[0], ksize, stride, padding, dilation)
if transpose: if transpose:
max_num_act = kv * indices.shape[0] max_num_act = kv * indices.shape[0]
pair_bwd = pair pair_bwd = pair
pair_bwd_tv = pair_tv pair_bwd_tv = pair_tv
indice_pairs_uniq = torch.empty((pair.numel() + 1, ), indice_pairs_uniq = torch.empty((pair.numel() + 1, ),
......
...@@ -32,9 +32,9 @@ def waymo_data(batch_size=1, num_features=-1): ...@@ -32,9 +32,9 @@ def waymo_data(batch_size=1, num_features=-1):
# 150000) # 150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz") data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = np.ascontiguousarray(data["pc"]) pc = np.ascontiguousarray(data["pc"])
print(pc.shape)
voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc)) voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
voxels = voxels_tv.numpy().reshape(-1, 3) voxels = voxels_tv.numpy().reshape(-1, 3)
if num_features > 0: if num_features > 0:
voxels = np.zeros((voxels.shape[0], num_features), dtype=voxels.dtype) voxels = np.zeros((voxels.shape[0], num_features), dtype=voxels.dtype)
coors = indices_tv.numpy() coors = indices_tv.numpy()
...@@ -316,6 +316,7 @@ import json ...@@ -316,6 +316,7 @@ import json
def main(): def main():
import pickle import pickle
np.random.seed(50051) np.random.seed(50051)
torch.manual_seed(50051) torch.manual_seed(50051)
# voxels, coors, spatial_shape = waymo_data(num_features=128) # voxels, coors, spatial_shape = waymo_data(num_features=128)
...@@ -377,14 +378,6 @@ def main(): ...@@ -377,14 +378,6 @@ def main():
# print("------------") # print("------------")
with tv.measure_duration() as measure: with tv.measure_duration() as measure:
out_nograd = net(voxels_th, coors_th, 1, show_metrics) out_nograd = net(voxels_th, coors_th, 1, show_metrics)
# res = timer.collect_by_name("forward", timer.get_all_pair_time())
# res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
# print(sum(res.values()) + sum(res2.values()))
# print(timer.get_all_pair_time())
# print(sum(timer.get_all_pair_time().values()))
# sort_bench()
times.append(measure.duration) times.append(measure.duration)
if show_metrics: if show_metrics:
timer = out_nograd._timer timer = out_nograd._timer
......
...@@ -31,6 +31,7 @@ import pccm ...@@ -31,6 +31,7 @@ import pccm
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult, ConvTuneResult from spconv.core_cc.csrc.sparse.convops import GemmTuneResult, ConvTuneResult
from spconv.pytorch.core import SparseConvTensor
from spconv.test_utils import TestCase from spconv.test_utils import TestCase
from cumm import tensorview as tv from cumm import tensorview as tv
from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
...@@ -44,8 +45,10 @@ from spconv.pytorch.cppcore import get_current_stream, torch_tensor_to_tv ...@@ -44,8 +45,10 @@ from spconv.pytorch.cppcore import get_current_stream, torch_tensor_to_tv
from spconv.test_utils import generate_sparse_data, params_grid from spconv.test_utils import generate_sparse_data, params_grid
import tqdm import tqdm
from spconv.constants import ALL_WEIGHT_IS_KRSC, SPCONV_CPP_GEMM from spconv.constants import ALL_WEIGHT_IS_KRSC, SPCONV_CPP_GEMM
from spconv.core_cc.csrc.sparse.inference import InferenceOps
from spconv.pytorch import functional as Fsp
assert ALL_WEIGHT_IS_KRSC is True, "we only support KRSC in spconv >= 2.2" assert ALL_WEIGHT_IS_KRSC is True, "we only support KRSC in spconv >= 2.2"
from spconv.pytorch.hash import HashTable
# TODO remove or release this when tf32 op is ready # TODO remove or release this when tf32 op is ready
torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cuda.matmul.allow_tf32 = False
...@@ -60,8 +63,9 @@ NUMPY_DTYPE_TO_TORCH = { ...@@ -60,8 +63,9 @@ NUMPY_DTYPE_TO_TORCH = {
class SparseConvTester: class SparseConvTester:
def __init__(self, algo: ConvAlgo, subm: bool, shape: List[int], bs: int, dtype: np.dtype, N: int, K: int, C: int, def __init__(self, algo: ConvAlgo, subm: bool, shape: List[int], bs: int, dtype: np.dtype, N: int, K: int, C: int,
ksize: int, stride: int, padding: int, dilation: int) -> None: ksize: int, stride: int, padding: int, dilation: int, check_bias: bool = False, check_act: bool = False) -> None:
ndim = 3 ndim = 3
transpose = False
self.shape = shape self.shape = shape
self.bs = bs self.bs = bs
self.dtype = dtype self.dtype = dtype
...@@ -77,6 +81,15 @@ class SparseConvTester: ...@@ -77,6 +81,15 @@ class SparseConvTester:
op = expand_nd(ndim, 0) op = expand_nd(ndim, 0)
self.kv: int = np.prod(self.ksize) self.kv: int = np.prod(self.ksize)
self.num_split = 1 if algo == ConvAlgo.MaskImplicitGemm else 2 self.num_split = 1 if algo == ConvAlgo.MaskImplicitGemm else 2
if not subm:
if transpose:
out_shape = ops.get_deconv_output_size(shape, self.ksize, self.stride,
self.padding, self.dilation, op)
else:
out_shape = ops.get_conv_output_size(shape, self.ksize, self.stride,
self.padding, self.dilation)
else:
out_shape = shape
sparse_dict = generate_sparse_data(shape, [N] * bs, C) sparse_dict = generate_sparse_data(shape, [N] * bs, C)
...@@ -88,10 +101,15 @@ class SparseConvTester: ...@@ -88,10 +101,15 @@ class SparseConvTester:
out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs( out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
indices_th, 1, shape, ConvAlgo.Native, self.ksize, self.stride, self.padding, indices_th, 1, shape, ConvAlgo.Native, self.ksize, self.stride, self.padding,
self.dilation, op, subm) self.dilation, op, subm)
self.ref_out_inds = out_inds
self.ref_out_inds_scalar = Fsp._indice_to_scalar(out_inds.long(), [bs, *out_shape])
self.indice_num_per_loc_np = indice_num_per_loc.cpu().numpy() self.indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
self.indice_pairs_np = pair_ref.cpu().numpy() self.indice_pairs_np = pair_ref.cpu().numpy()
self.pair_native = pair_ref self.pair_native = pair_ref
self.indice_num_per_loc = indice_num_per_loc self.indice_num_per_loc = indice_num_per_loc
self.use_direct_table = True
self.out_shape = out_shape
if algo == ConvAlgo.Native: if algo == ConvAlgo.Native:
self.out_inds: torch.Tensor = out_inds self.out_inds: torch.Tensor = out_inds
self.num_inds_per_loc: torch.Tensor = indice_num_per_loc self.num_inds_per_loc: torch.Tensor = indice_num_per_loc
...@@ -105,7 +123,7 @@ class SparseConvTester: ...@@ -105,7 +123,7 @@ class SparseConvTester:
else: else:
res = ops.get_indice_pairs_implicit_gemm(indices_th, bs, shape, res = ops.get_indice_pairs_implicit_gemm(indices_th, bs, shape,
algo, self.ksize, self.stride, self.padding, algo, self.ksize, self.stride, self.padding,
self.dilation, op, subm=subm) self.dilation, op, subm=subm, direct_table=self.use_direct_table)
self.out_inds = res[0] self.out_inds = res[0]
self.num_inds_per_loc = res[1] self.num_inds_per_loc = res[1]
...@@ -116,8 +134,27 @@ class SparseConvTester: ...@@ -116,8 +134,27 @@ class SparseConvTester:
self.mask_argsort_fwd_splits = res[6] self.mask_argsort_fwd_splits = res[6]
self.mask_argsort_bwd_splits = res[7] self.mask_argsort_bwd_splits = res[7]
self.masks = res[8] self.masks = res[8]
self.out_inds_scalar = Fsp._indice_to_scalar(self.out_inds.long(), [bs, *out_shape])
table = HashTable(out_inds.device, torch.int64, torch.int32, self.out_inds.shape[0] * 2)
# test coords -> test out indexes
table.insert(self.out_inds_scalar, torch.arange(0, self.out_inds.shape[0], dtype=torch.int32, device=self.device))
# out_order: test_order_to_ref, test index for each ref coord
out_order, is_empty = table.query(self.ref_out_inds_scalar)
assert is_empty.int().sum().item() == 0, "shouldn't happen"
self.out_order = out_order.cpu().numpy()
# inp_table = HashTable(out_inds.device, torch.int64, torch.int32, self.ref_out_inds.shape[0] * 2)
# inp_table.insert(self.ref_out_inds_scalar, torch.arange(0, self.ref_out_inds.shape[0], dtype=torch.int32, device=self.device))
# # out_order: ref index for each out coord
# out_order, is_empty = inp_table.query(self.out_inds_scalar)
self.voxels_np = voxels_np self.voxels_np = voxels_np
self.indices_np = indices_np self.indices_np = indices_np
self.check_bias = check_bias
self.check_act = check_act
self.subm = subm self.subm = subm
if dtype == np.int8: if dtype == np.int8:
...@@ -128,6 +165,10 @@ class SparseConvTester: ...@@ -128,6 +165,10 @@ class SparseConvTester:
self.output = np.random.randint(-2, 2, size=[ self.output = np.random.randint(-2, 2, size=[
self.out_inds.shape[0], K self.out_inds.shape[0], K
]).astype(dtype) ]).astype(dtype)
self.bias = np.random.randint(-2, 2, size=[
K
]).astype(dtype)
else: else:
self.inp = np.random.uniform(-1, 1, size=[ self.inp = np.random.uniform(-1, 1, size=[
voxels_np.shape[0], C voxels_np.shape[0], C
...@@ -136,14 +177,25 @@ class SparseConvTester: ...@@ -136,14 +177,25 @@ class SparseConvTester:
self.output = np.random.uniform(-1, 1, size=[ self.output = np.random.uniform(-1, 1, size=[
self.out_inds.shape[0], K self.out_inds.shape[0], K
]).astype(dtype) ]).astype(dtype)
self.bias = np.random.uniform(-1, 1, size=[
K
]).astype(dtype)
self.weight_ref = self.weight.transpose(1, 2, 3, 0, 4) self.weight_ref = self.weight.transpose(1, 2, 3, 0, 4)
self.weight_ref = np.ascontiguousarray(self.weight_ref).reshape(-1, K, C) self.weight_ref = np.ascontiguousarray(self.weight_ref).reshape(-1, K, C)
self.out_ref, self.din_ref, self.dw_ref = self._get_ref_output() self.out_ref, self.din_ref, self.dw_ref = self._get_ref_output()
if check_bias:
self.out_ref += self.bias
# relu
if check_act:
self.out_ref = np.maximum(self.out_ref, 0)
self.dw_ref = np.ascontiguousarray(self.dw_ref.transpose(1, 0, 2).reshape(K, *self.ksize, C)) self.dw_ref = np.ascontiguousarray(self.dw_ref.transpose(1, 0, 2).reshape(K, *self.ksize, C))
self.arch = tv.get_compute_capability() self.arch = tv.get_compute_capability()
def get_output_ref_spt(self):
    # Wrap the reference output as a SparseConvTensor: out_ref is converted
    # from numpy and moved to CUDA, paired with the reference output
    # indices, the output spatial shape and the batch size.
    ref_features = torch.from_numpy(self.out_ref).cuda()
    return SparseConvTensor(ref_features, self.ref_out_inds, self.out_shape,
                            self.bs)
def _get_ref_output(self): def _get_ref_output(self):
output_ref = np.zeros_like(self.output, dtype=np.float32) output_ref = np.zeros_like(self.output, dtype=np.float32)
dinput_ref = np.zeros_like(self.inp, dtype=np.float32) dinput_ref = np.zeros_like(self.inp, dtype=np.float32)
...@@ -165,13 +217,15 @@ class SparseConvTester: ...@@ -165,13 +217,15 @@ class SparseConvTester:
np.float32) @ self.weight_ref[filter_offset].T.astype( np.float32) @ self.weight_ref[filter_offset].T.astype(
np.float32) np.float32)
output_ref[o_inds] += cc output_ref[o_inds] += cc
a = self.output[o_inds] # we use random output as dout here
a = self.output[self.out_order][o_inds]
# NK @ KC # NK @ KC
cc = a.astype( cc = a.astype(
np.float32) @ self.weight_ref[filter_offset].astype( np.float32) @ self.weight_ref[filter_offset].astype(
np.float32) np.float32)
dinput_ref[i_inds] += cc dinput_ref[i_inds] += cc
out_gather = self.output[o_inds] # [N, K] # use random output and random inp as dout and inp
out_gather = self.output[self.out_order][o_inds] # [N, K]
inp_gather = self.inp[i_inds] # [N, C] inp_gather = self.inp[i_inds] # [N, C]
# KN @ NC # KN @ NC
dw_res = out_gather.astype( dw_res = out_gather.astype(
...@@ -225,7 +279,7 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -225,7 +279,7 @@ def _test_impgemm_conv_cuda(subm: bool):
shapes = [[19, 18, 17]] shapes = [[19, 18, 17]]
batchsizes = [1] batchsizes = [1]
dtypes = [np.float32, np.float16] dtypes = [np.float32, np.float16]
dtypes = [np.int8] # dtypes = [np.int8]
test_case = TestCase() test_case = TestCase()
# in_channels = [32] # in_channels = [32]
# out_channels = [32, 48, 64] # out_channels = [32, 48, 64]
...@@ -245,6 +299,7 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -245,6 +299,7 @@ def _test_impgemm_conv_cuda(subm: bool):
strides = [1, 2, 3] strides = [1, 2, 3]
paddings = [0, 1] paddings = [0, 1]
dilations = [1, 2] dilations = [1, 2]
algos = [ algos = [
# ConvAlgo.MaskSplitImplicitGemm, # ConvAlgo.MaskSplitImplicitGemm,
ConvAlgo.MaskImplicitGemm, ConvAlgo.MaskImplicitGemm,
...@@ -261,11 +316,14 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -261,11 +316,14 @@ def _test_impgemm_conv_cuda(subm: bool):
multipler = max(C, K) / multiple_base multipler = max(C, K) / multiple_base
multipler = max(multipler, 1.0) multipler = max(multipler, 1.0)
# print(num_batch) # print(num_batch)
tester = SparseConvTester(algo, subm, shape, bs, dtype, num_batch, K, C, k, s, p, d) tester = SparseConvTester(algo, subm, shape, bs, dtype, num_batch, K, C, k, s, p, d, check_bias=True, check_act=True)
bias = None
act = tv.gemm.Activation.None_
if tester.check_bias:
bias = tv.from_numpy(tester.bias).cuda()
atol, rtol = dtype_to_tol[dtype] atol, rtol = dtype_to_tol[dtype]
mask_width_to_mask_out_fwd: Dict[int, torch.Tensor] = {} mask_width_to_mask_out_fwd: Dict[int, torch.Tensor] = {}
mask_width_to_mask_out_bwd: Dict[int, torch.Tensor] = {} mask_width_to_mask_out_bwd: Dict[int, torch.Tensor] = {}
op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput] op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput]
spk = 1 spk = 1
for op_type in op_types: for op_type in op_types:
...@@ -276,7 +334,11 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -276,7 +334,11 @@ def _test_impgemm_conv_cuda(subm: bool):
NHWC.layout_type.value, NHWC.interleave, NHWC.interleave, NHWC.interleave, arch, op_type.value, -1, True, False) NHWC.layout_type.value, NHWC.interleave, NHWC.interleave, NHWC.interleave, arch, op_type.value, -1, True, False)
else: else:
avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, op_type, -1) avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, op_type, -1)
if op_type == ConvOpType.kForward and tester.check_act:
act = tv.gemm.Activation.ReLU
else:
act = tv.gemm.Activation.None_
assert avail_desps
for desp in avail_desps: for desp in avail_desps:
if not subm: if not subm:
if op_type == ConvOpType.kForward: if op_type == ConvOpType.kForward:
...@@ -292,7 +354,10 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -292,7 +354,10 @@ def _test_impgemm_conv_cuda(subm: bool):
dtype=torch.int32, dtype=torch.int32,
device=tester.device) device=tester.device)
mask_output_fwd = mask_width_to_mask_out_fwd[mask_width] mask_output_fwd = mask_width_to_mask_out_fwd[mask_width]
is_fwd = desp.op_type.value == ConvOpType.kForward.value
bias_cur = bias
if op_type != ConvOpType.kForward:
bias_cur = None
if subm: if subm:
if desp.op_type.value == ConvOpType.kForward.value: if desp.op_type.value == ConvOpType.kForward.value:
indice_pairs = tester.pair_fwd indice_pairs = tester.pair_fwd
...@@ -303,9 +368,12 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -303,9 +368,12 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_output = mask_output_fwd mask_output = mask_output_fwd
# print([bin(x.item()) for x in masks]) # print([bin(x.item()) for x in masks])
for j in range(tester.num_split): for j in range(tester.num_split):
beta = 1 if j == 1 else 0 beta = 1 if j > 0 else 0
if bias_cur is not None:
beta = 1
if j > 0:
bias_cur = None
mask_filter = tester.masks[j].item() mask_filter = tester.masks[j].item()
reverse_mask = False reverse_mask = False
if desp.op_type.value == ConvOpType.kBackwardWeight.value: if desp.op_type.value == ConvOpType.kBackwardWeight.value:
mask_op = mask_output[j] mask_op = mask_output[j]
...@@ -338,6 +406,8 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -338,6 +406,8 @@ def _test_impgemm_conv_cuda(subm: bool):
beta=beta, beta=beta,
verbose=False, verbose=False,
force_nvrtc=force_nvrtc, force_nvrtc=force_nvrtc,
bias=bias_cur if is_fwd and bias_cur is not None else tv.Tensor(),
act_type=act,
) )
else: else:
CONV.run_with_tuned_result( CONV.run_with_tuned_result(
...@@ -356,6 +426,8 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -356,6 +426,8 @@ def _test_impgemm_conv_cuda(subm: bool):
beta=beta, beta=beta,
verbose=False, verbose=False,
force_nvrtc=force_nvrtc, force_nvrtc=force_nvrtc,
bias=bias_cur if is_fwd else None,
act_type=act,
) )
else: else:
...@@ -382,7 +454,12 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -382,7 +454,12 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_output = mask_output_fwd mask_output = mask_output_fwd
for j in range(tester.num_split): for j in range(tester.num_split):
beta = 1 if j == 1 else 0 # beta = 1 if j == 1 else 0
beta = 1 if j > 0 else 0
if bias_cur is not None:
beta = 1
if j > 0:
bias_cur = None
mask_filter = tester.masks[j].item() mask_filter = tester.masks[j].item()
reverse_mask = False reverse_mask = False
if desp.op_type.value == ConvOpType.kBackwardWeight.value: if desp.op_type.value == ConvOpType.kBackwardWeight.value:
...@@ -406,6 +483,9 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -406,6 +483,9 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_width=mask_width, mask_width=mask_width,
beta=beta, beta=beta,
verbose=False, verbose=False,
force_nvrtc=force_nvrtc,
bias=bias if is_fwd and bias is not None else tv.Tensor(),
act_type=act,
) )
else: else:
CONV.run_with_tuned_result( CONV.run_with_tuned_result(
...@@ -423,6 +503,9 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -423,6 +503,9 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_width=mask_width, mask_width=mask_width,
beta=beta, beta=beta,
verbose=False, verbose=False,
force_nvrtc=force_nvrtc,
bias=bias if is_fwd else None,
act_type=act,
) )
out_ref = tester.out_ref out_ref = tester.out_ref
...@@ -430,6 +513,7 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -430,6 +513,7 @@ def _test_impgemm_conv_cuda(subm: bool):
dw_ref = tester.dw_ref dw_ref = tester.dw_ref
if op_type == ConvOpType.kForward: if op_type == ConvOpType.kForward:
out_my = output_tv.cpu().numpy() out_my = output_tv.cpu().numpy()
out_my = out_my[tester.out_order]
if dtype != np.float16: if dtype != np.float16:
test_case.assertAllClose(out_ref, out_my, atol=atol, rtol=rtol) test_case.assertAllClose(out_ref, out_my, atol=atol, rtol=rtol)
else: else:
...@@ -437,7 +521,6 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -437,7 +521,6 @@ def _test_impgemm_conv_cuda(subm: bool):
if (error_norm > 5): if (error_norm > 5):
print(f"{desp}, Error={error_norm}") print(f"{desp}, Error={error_norm}")
assert error_norm < 10 * multipler assert error_norm < 10 * multipler
# print(desp, )
else: else:
din_my = inp_tv.cpu().numpy() din_my = inp_tv.cpu().numpy()
if dtype != np.float16: if dtype != np.float16:
...@@ -446,7 +529,6 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -446,7 +529,6 @@ def _test_impgemm_conv_cuda(subm: bool):
error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1)) error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
assert error_norm < 10 * multipler, f"{desp}, {error_norm}, {k}, {s}, {p}, {d}" assert error_norm < 10 * multipler, f"{desp}, {error_norm}, {k}, {s}, {p}, {d}"
inp_tv, weight_tv, output_tv = tester.get_operands(ConvOpType.kBackwardWeight) inp_tv, weight_tv, output_tv = tester.get_operands(ConvOpType.kBackwardWeight)
for spk in [1, 4, 16, 64]: for spk in [1, 4, 16, 64]:
for mask_width, mask_output in mask_width_to_mask_out_fwd.items(): for mask_width, mask_output in mask_width_to_mask_out_fwd.items():
if SPCONV_CPP_GEMM: if SPCONV_CPP_GEMM:
...@@ -554,7 +636,10 @@ def _test_native_conv_cuda(subm: bool): ...@@ -554,7 +636,10 @@ def _test_native_conv_cuda(subm: bool):
for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid( for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid(
shapes, batchsizes, in_channels, out_channels, ksizes, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations, dtypes)): strides, paddings, dilations, dtypes)):
tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype, 1500, K, C, k, s, p, d) tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype, 1500, K, C, k, s, p, d, check_bias=True, check_act=True)
bias = None
if tester.check_bias:
bias = tv.from_numpy(tester.bias).cuda()
atol, rtol = dtype_to_tol[dtype] atol, rtol = dtype_to_tol[dtype]
multipler = max(C, K) / multiple_base multipler = max(C, K) / multiple_base
multipler = max(multipler, 1.0) multipler = max(multipler, 1.0)
...@@ -580,7 +665,6 @@ def _test_native_conv_cuda(subm: bool): ...@@ -580,7 +665,6 @@ def _test_native_conv_cuda(subm: bool):
inp_tv = torch_tensor_to_tv(inp_th) inp_tv = torch_tensor_to_tv(inp_th)
weight_tv = torch_tensor_to_tv(weight_th) weight_tv = torch_tensor_to_tv(weight_th)
output_tv = torch_tensor_to_tv(output_th) output_tv = torch_tensor_to_tv(output_th)
if op_type == ConvOpType.kForward: if op_type == ConvOpType.kForward:
a = inp_tv a = inp_tv
c = output_tv c = output_tv
...@@ -593,9 +677,11 @@ def _test_native_conv_cuda(subm: bool): ...@@ -593,9 +677,11 @@ def _test_native_conv_cuda(subm: bool):
for desp in avail_desps: for desp in avail_desps:
if subm: if subm:
torch.mm(inp_th, weight_th[:, tester.kv // 2].T, out=output_th) torch.mm(inp_th, weight_th[:, tester.kv // 2].T, out=output_th)
# output_th += bias_th
else: else:
output_tv.zero_() output_tv.zero_()
inited = subm inited = subm
# determine last valid subm indices, then apply
for i, nhot in enumerate(indice_pair_num_cpu): for i, nhot in enumerate(indice_pair_num_cpu):
if subm and i == kv_center: if subm and i == kv_center:
continue continue
...@@ -643,8 +729,14 @@ def _test_native_conv_cuda(subm: bool): ...@@ -643,8 +729,14 @@ def _test_native_conv_cuda(subm: bool):
hint=AlgoHint.Fowrard.value, hint=AlgoHint.Fowrard.value,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta)
inited = True inited = True
if bias is not None and tester.check_act:
InferenceOps.bias_add_act_inplace(output_tv, bias, tv.gemm.Activation.ReLU, 0, 0)
else:
if bias is not None:
InferenceOps.bias_add_inplace(output_tv, bias, 0)
if tester.check_act:
InferenceOps.activation_inplace(output_tv, tv.gemm.Activation.ReLU, 0, 0)
out_my = output_tv.cpu().numpy() out_my = output_tv.cpu().numpy()
if dtype != np.float16: if dtype != np.float16:
# error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1)) # error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
...@@ -807,7 +899,7 @@ def _test_native_conv_cuda(subm: bool): ...@@ -807,7 +899,7 @@ def _test_native_conv_cuda(subm: bool):
def test_all_algo_unit():
    """Run the implicit-GEMM and native conv unit tests in both modes.

    Each helper takes a single ``subm`` flag and is run once with ``True``
    and once with ``False``. Previously the implicit-GEMM helper was called
    with ``True`` twice, so its ``subm=False`` path was never exercised even
    though the native helper covered both.
    """
    _test_impgemm_conv_cuda(True)
    _test_impgemm_conv_cuda(False)
    _test_native_conv_cuda(True)
    _test_native_conv_cuda(False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment