Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
d0bfb3a3
Commit
d0bfb3a3
authored
Sep 06, 2022
by
yan.yan
Browse files
add fused bias/act
parent
2b195e43
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
504 additions
and
53 deletions
+504
-53
setup.py
setup.py
+3
-2
spconv/algo.py
spconv/algo.py
+21
-3
spconv/build.py
spconv/build.py
+2
-0
spconv/constants.py
spconv/constants.py
+3
-1
spconv/core_cc/csrc/sparse/convops/convops.pyi
spconv/core_cc/csrc/sparse/convops/convops.pyi
+6
-1
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+6
-1
spconv/core_cc/csrc/sparse/convops/spops.pyi
spconv/core_cc/csrc/sparse/convops/spops.pyi
+11
-2
spconv/core_cc/csrc/sparse/inference.pyi
spconv/core_cc/csrc/sparse/inference.pyi
+37
-0
spconv/csrc/sparse/convops.py
spconv/csrc/sparse/convops.py
+73
-8
spconv/csrc/sparse/inference.py
spconv/csrc/sparse/inference.py
+213
-0
spconv/gencode/__main__.py
spconv/gencode/__main__.py
+2
-0
spconv/pytorch/core.py
spconv/pytorch/core.py
+4
-0
spconv/pytorch/ops.py
spconv/pytorch/ops.py
+8
-5
test/benchmark.py
test/benchmark.py
+2
-9
test/test_all_algo.py
test/test_all_algo.py
+113
-21
No files found.
setup.py
View file @
d0bfb3a3
...
@@ -52,7 +52,7 @@ REQUIRES_PYTHON = '>=3.6'
...
@@ -52,7 +52,7 @@ REQUIRES_PYTHON = '>=3.6'
VERSION
=
None
VERSION
=
None
# What packages are required for this module to be executed?
# What packages are required for this module to be executed?
REQUIRED
=
[
"pccm>=0.
2.21
"
,
"pybind11>=2.6.0"
,
"fire"
,
"numpy"
,
*
deps
]
REQUIRED
=
[
"pccm>=0.
3.5
"
,
"pybind11>=2.6.0"
,
"fire"
,
"numpy"
,
*
deps
]
# What packages are optional?
# What packages are optional?
EXTRAS
=
{
EXTRAS
=
{
...
@@ -162,6 +162,7 @@ if disable_jit is not None and disable_jit == "1":
...
@@ -162,6 +162,7 @@ if disable_jit is not None and disable_jit == "1":
from
spconv.csrc.sparse.alloc
import
ExternalAllocator
from
spconv.csrc.sparse.alloc
import
ExternalAllocator
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
from
spconv.csrc.sparse.inference
import
InferenceOps
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
convcu
=
ConvMainUnitTest
(
IMPLGEMM_SIMT_PARAMS
+
IMPLGEMM_VOLTA_PARAMS
+
IMPLGEMM_TURING_PARAMS
)
convcu
=
ConvMainUnitTest
(
IMPLGEMM_SIMT_PARAMS
+
IMPLGEMM_VOLTA_PARAMS
+
IMPLGEMM_TURING_PARAMS
)
...
@@ -192,7 +193,7 @@ if disable_jit is not None and disable_jit == "1":
...
@@ -192,7 +193,7 @@ if disable_jit is not None and disable_jit == "1":
cus
=
[
gemmtuner
,
convtuner
,
cus
=
[
gemmtuner
,
convtuner
,
convops
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
convops
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
ExternalAllocator
(),
ExternalAllocator
(),
ExternalSpconvMatmul
()]
ExternalSpconvMatmul
()
,
InferenceOps
()
]
if
not
CUMM_CPU_ONLY_BUILD
:
if
not
CUMM_CPU_ONLY_BUILD
:
cus
.
extend
([
cu
,
convcu
])
cus
.
extend
([
cu
,
convcu
])
ext_modules
:
List
[
Extension
]
=
[
ext_modules
:
List
[
Extension
]
=
[
...
...
spconv/algo.py
View file @
d0bfb3a3
...
@@ -606,7 +606,11 @@ class SimpleGemm:
...
@@ -606,7 +606,11 @@ class SimpleGemm:
gather_data
:
tv
.
Tensor
=
tv
.
Tensor
(),
gather_data
:
tv
.
Tensor
=
tv
.
Tensor
(),
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
force_nvrtc
:
bool
=
False
):
force_nvrtc
:
bool
=
False
,
bias
:
Optional
[
tv
.
Tensor
]
=
None
,
act_alpha
:
float
=
0.0
,
act_beta
:
float
=
0.0
,
act_type
:
tv
.
gemm
.
Activation
=
tv
.
gemm
.
Activation
.
None_
):
m
,
n
,
k
=
GemmMainUnitTest
.
extract_mnk
(
a
.
shape
,
b
.
shape
,
trans_a
,
m
,
n
,
k
=
GemmMainUnitTest
.
extract_mnk
(
a
.
shape
,
b
.
shape
,
trans_a
,
trans_b
,
trans_c
,
trans_b
,
trans_c
,
shuffle_type
.
value
,
shuffle_type
.
value
,
...
@@ -630,6 +634,8 @@ class SimpleGemm:
...
@@ -630,6 +634,8 @@ class SimpleGemm:
params
.
a
=
a
params
.
a
=
a
params
.
b
=
b
params
.
b
=
b
params
.
c
=
c
params
.
c
=
c
if
bias
is
not
None
:
params
.
d
=
bias
params
.
a_inds
=
a_inds
params
.
a_inds
=
a_inds
params
.
b_inds
=
b_inds
params
.
b_inds
=
b_inds
params
.
c_inds
=
c_inds
params
.
c_inds
=
c_inds
...
@@ -638,6 +644,9 @@ class SimpleGemm:
...
@@ -638,6 +644,9 @@ class SimpleGemm:
params
.
stream
=
stream
params
.
stream
=
stream
params
.
alpha
=
alpha
params
.
alpha
=
alpha
params
.
beta
=
beta
params
.
beta
=
beta
params
.
act_alpha
=
act_alpha
params
.
act_beta
=
act_beta
params
.
act_type
=
act_type
params
.
workspace
=
workspace
params
.
workspace
=
workspace
# gather = 0
# gather = 0
# if profile_res.external_gather and not gather_data.empty():
# if profile_res.external_gather and not gather_data.empty():
...
@@ -973,7 +982,11 @@ class SimpleConv:
...
@@ -973,7 +982,11 @@ class SimpleConv:
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
verbose
:
bool
=
False
,
verbose
:
bool
=
False
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
force_nvrtc
:
bool
=
False
):
force_nvrtc
:
bool
=
False
,
bias
:
Optional
[
tv
.
Tensor
]
=
None
,
act_alpha
:
float
=
0.0
,
act_beta
:
float
=
0.0
,
act_type
:
tv
.
gemm
.
Activation
=
tv
.
gemm
.
Activation
.
None_
):
channel_k
=
output
.
dim
(
1
)
channel_k
=
output
.
dim
(
1
)
channel_c
=
inp
.
dim
(
1
)
channel_c
=
inp
.
dim
(
1
)
# GemmMainUnitTest.stream_synchronize(stream)
# GemmMainUnitTest.stream_synchronize(stream)
...
@@ -989,7 +1002,7 @@ class SimpleConv:
...
@@ -989,7 +1002,7 @@ class SimpleConv:
params
=
ConvParams
(
NDIM_DONT_CARE
,
ConvOpTypeCpp
(
op_type_value
))
params
=
ConvParams
(
NDIM_DONT_CARE
,
ConvOpTypeCpp
(
op_type_value
))
is_not_static
=
str
(
is_not_static
=
str
(
algo_desp
)
not
in
self
.
prebuilt_desp_names
algo_desp
)
not
in
self
.
prebuilt_desp_names
if
algo_desp
.
is_nvrtc
and
(
is_not_static
or
force_nvrtc
):
if
force_nvrtc
or
(
algo_desp
.
is_nvrtc
and
is_not_static
):
params
.
nvrtc_params
=
self
.
_cached_get_nvrtc_params
(
params
.
nvrtc_params
=
self
.
_cached_get_nvrtc_params
(
algo_desp
,
profile_res
.
arch
)
algo_desp
,
profile_res
.
arch
)
params
.
conv_algo_desp
=
profile_res
.
algo_desp
params
.
conv_algo_desp
=
profile_res
.
algo_desp
...
@@ -1001,6 +1014,9 @@ class SimpleConv:
...
@@ -1001,6 +1014,9 @@ class SimpleConv:
params
.
split_k_slices
=
split_k_slices
params
.
split_k_slices
=
split_k_slices
params
.
alpha
=
alpha
params
.
alpha
=
alpha
params
.
beta
=
beta
params
.
beta
=
beta
params
.
act_alpha
=
act_alpha
params
.
act_beta
=
act_beta
params
.
act_type
=
act_type
params
.
stream
=
stream
params
.
stream
=
stream
params
.
mask_argsort
=
mask_argsort
params
.
mask_argsort
=
mask_argsort
params
.
indices
=
indices
params
.
indices
=
indices
...
@@ -1011,6 +1027,8 @@ class SimpleConv:
...
@@ -1011,6 +1027,8 @@ class SimpleConv:
params
.
mask_filter
=
mask_filter
params
.
mask_filter
=
mask_filter
params
.
mask_output
=
mask_output
params
.
mask_output
=
mask_output
params
.
reverse_mask
=
reverse_mask
params
.
reverse_mask
=
reverse_mask
if
bias
is
not
None
:
params
.
bias
=
bias
if
timer
.
enable
:
if
timer
.
enable
:
assert
timer
.
_timer
is
not
None
assert
timer
.
_timer
is
not
None
params
.
timer
=
timer
.
_timer
params
.
timer
=
timer
.
_timer
...
...
spconv/build.py
View file @
d0bfb3a3
...
@@ -36,6 +36,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
...
@@ -36,6 +36,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
from
spconv.csrc.sparse.convops
import
SimpleExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
SimpleExternalSpconvMatmul
from
spconv.csrc.sparse.inference
import
InferenceOps
all_shuffle
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
all_shuffle
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
all_shuffle
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_shuffle
))
all_shuffle
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_shuffle
))
...
@@ -63,6 +64,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
...
@@ -63,6 +64,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
ExternalAllocator
(),
ExternalAllocator
(),
ExternalSpconvMatmul
(),
ExternalSpconvMatmul
(),
SimpleExternalSpconvMatmul
(),
# for debug, won't be included in release
SimpleExternalSpconvMatmul
(),
# for debug, won't be included in release
InferenceOps
(),
]
]
pccm
.
builder
.
build_pybind
(
cus
,
pccm
.
builder
.
build_pybind
(
cus
,
PACKAGE_ROOT
/
"core_cc"
,
PACKAGE_ROOT
/
"core_cc"
,
...
...
spconv/constants.py
View file @
d0bfb3a3
...
@@ -100,7 +100,9 @@ class AllocKeys:
...
@@ -100,7 +100,9 @@ class AllocKeys:
SPCONV_DEBUG_WEIGHT
=
False
SPCONV_DEBUG_WEIGHT
=
False
SPCONV_CPP_INDICE_PAIRS
=
False
SPCONV_CPP_INDICE_PAIRS
=
True
SPCONV_USE_DIRECT_TABLE
=
True
# currently the cpp pair gen is slightly slower than the python one; I don't know why.
# currently the cpp pair gen is slightly slower than the python one; I don't know why.
SPCONV_CPP_INDICE_PAIRS_IGEMM
=
os
.
getenv
(
"SPCONV_CPP_INDICE_PAIRS_IGEMM"
,
"0"
)
==
"1"
SPCONV_CPP_INDICE_PAIRS_IGEMM
=
os
.
getenv
(
"SPCONV_CPP_INDICE_PAIRS_IGEMM"
,
"0"
)
==
"1"
...
...
spconv/core_cc/csrc/sparse/convops/convops.pyi
View file @
d0bfb3a3
...
@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
...
@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview.gemm import Activation
class ConvTunerSimple:
class ConvTunerSimple:
def __init__(self, desps: List[ConvAlgoDesp]) -> None:
def __init__(self, desps: List[ConvAlgoDesp]) -> None:
"""
"""
...
@@ -88,7 +89,7 @@ class ConvTunerSimple:
...
@@ -88,7 +89,7 @@ class ConvTunerSimple:
mask_width:
mask_width:
"""
"""
...
...
def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False) -> None:
def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False
, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_
) -> None:
"""
"""
Args:
Args:
profile_res:
profile_res:
...
@@ -110,6 +111,10 @@ class ConvTunerSimple:
...
@@ -110,6 +111,10 @@ class ConvTunerSimple:
verbose:
verbose:
timer:
timer:
force_nvrtc:
force_nvrtc:
bias:
act_alpha:
act_beta:
act_type:
"""
"""
...
...
def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int:
def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int:
...
...
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
View file @
d0bfb3a3
...
@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
...
@@ -5,6 +5,7 @@ from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview.gemm import Activation
class GemmTunerSimple:
class GemmTunerSimple:
def __init__(self, desps: List[GemmAlgoDesp]) -> None:
def __init__(self, desps: List[GemmAlgoDesp]) -> None:
"""
"""
...
@@ -81,7 +82,7 @@ class GemmTunerSimple:
...
@@ -81,7 +82,7 @@ class GemmTunerSimple:
hint:
hint:
"""
"""
...
...
def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False) -> None:
def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False
, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_
) -> None:
"""
"""
Args:
Args:
profile_res:
profile_res:
...
@@ -103,5 +104,9 @@ class GemmTunerSimple:
...
@@ -103,5 +104,9 @@ class GemmTunerSimple:
workspace:
workspace:
timer:
timer:
force_nvrtc:
force_nvrtc:
bias:
act_alpha:
act_beta:
act_type:
"""
"""
...
...
spconv/core_cc/csrc/sparse/convops/spops.pyi
View file @
d0bfb3a3
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import Activation
from cumm.tensorview import CUDAKernelTimer
from cumm.tensorview import CUDAKernelTimer
class ConvGemmOps:
class ConvGemmOps:
@staticmethod
@staticmethod
...
@@ -11,7 +12,7 @@ class ConvGemmOps:
...
@@ -11,7 +12,7 @@ class ConvGemmOps:
"""
"""
...
...
@staticmethod
@staticmethod
def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None:
def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, arch: Tuple[int, int], num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0
, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_
) -> None:
"""
"""
1. this function need to take a out features
1. this function need to take a out features
that from subm first mm.
that from subm first mm.
...
@@ -32,6 +33,10 @@ class ConvGemmOps:
...
@@ -32,6 +33,10 @@ class ConvGemmOps:
subm:
subm:
algo:
algo:
stream_int:
stream_int:
bias:
act_alpha:
act_beta:
act_type:
"""
"""
...
...
@staticmethod
@staticmethod
...
@@ -56,7 +61,7 @@ class ConvGemmOps:
...
@@ -56,7 +61,7 @@ class ConvGemmOps:
"""
"""
...
...
@staticmethod
@staticmethod
def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> Tuple[int, Any]:
def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, arch: Tuple[int, int], is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False
, bias: Tensor = Tensor(), act_alpha: float = 0.0, act_beta: float = 0.0, act_type: Activation = Activation.None_
) -> Tuple[int, Any]:
"""
"""
Args:
Args:
allocator:
allocator:
...
@@ -75,6 +80,10 @@ class ConvGemmOps:
...
@@ -75,6 +80,10 @@ class ConvGemmOps:
timer:
timer:
auto_fp32_accum:
auto_fp32_accum:
fp32_accum:
fp32_accum:
bias:
act_alpha:
act_beta:
act_type:
"""
"""
...
...
@staticmethod
@staticmethod
...
...
spconv/core_cc/csrc/sparse/inference.pyi
0 → 100644
View file @
d0bfb3a3
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import Activation
class InferenceOps:
    # Type stub for the in-place bias / activation helpers bound from
    # spconv/csrc/sparse/inference.py. All three functions return None and
    # take the tensor to update as `out`.
    @staticmethod
    def bias_add_act_inplace(out: Tensor, bias: Tensor, act_type: Activation = Activation.None_, alpha: float = 0.0, beta: float = 0.0, stream: int = 0) -> None:
        """
        Add ``bias`` to ``out`` and apply an activation, in place.

        NOTE(review): the generator raises "this function don't support cpu
        only build." when built with CUMM_CPU_ONLY_BUILD.

        Args:
            out: tensor updated in place (callers in convops.py pass the
                output features).
            bias: bias tensor; presumably one value per feature column of
                ``out`` -- TODO confirm against the C++ wrapper.
            act_type: activation to apply after the bias add; defaults to
                ``Activation.None_``.
            alpha: activation parameter; the visible CUDA kernel uses it as
                the LeakyReLU negative slope.
            beta: second activation parameter; not read by the kernels
                visible in inference.py.
            stream: stream handle passed as an integer; defaults to 0.
        """
        ...
    @staticmethod
    def bias_add_inplace(out: Tensor, bias: Tensor, stream: int = 0) -> None:
        """
        Add ``bias`` to ``out`` in place, without an activation.

        Args:
            out: tensor updated in place.
            bias: bias tensor; presumably one value per feature column of
                ``out`` -- TODO confirm against the C++ wrapper.
            stream: stream handle passed as an integer; defaults to 0.
        """
        ...
    @staticmethod
    def activation_inplace(out: Tensor, act_type: Activation, alpha: float, beta: float, stream: int = 0) -> None:
        """
        Apply an activation to ``out`` in place.

        Args:
            out: tensor updated in place.
            act_type: activation to apply (no default here, unlike
                ``bias_add_act_inplace``).
            alpha: activation parameter; the visible CUDA kernel uses it as
                the LeakyReLU negative slope.
            beta: second activation parameter; not read by the kernels
                visible in inference.py.
            stream: stream handle passed as an integer; defaults to 0.
        """
        ...
spconv/csrc/sparse/convops.py
View file @
d0bfb3a3
...
@@ -14,7 +14,7 @@ from spconv.csrc.sparse.gather import GatherCPU
...
@@ -14,7 +14,7 @@ from spconv.csrc.sparse.gather import GatherCPU
from
.alloc
import
ExternalAllocator
from
.alloc
import
ExternalAllocator
from
cumm.common
import
CompileInfo
from
cumm.common
import
CompileInfo
from
.inference
import
InferenceOps
class
ExternalSpconvMatmul
(
pccm
.
Class
):
class
ExternalSpconvMatmul
(
pccm
.
Class
):
"""a helper class to warp matmul operations
"""a helper class to warp matmul operations
...
@@ -834,6 +834,12 @@ class GemmTunerSimple(pccm.ParameterizedClass):
...
@@ -834,6 +834,12 @@ class GemmTunerSimple(pccm.ParameterizedClass):
code
.
arg
(
"timer"
,
"tv::CUDAKernelTimer"
,
"tv::CUDAKernelTimer(false)"
,
code
.
arg
(
"timer"
,
"tv::CUDAKernelTimer"
,
"tv::CUDAKernelTimer(false)"
,
"cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(False)"
)
"cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(False)"
)
code
.
arg
(
"force_nvrtc"
,
f
"bool"
,
"false"
)
code
.
arg
(
"force_nvrtc"
,
f
"bool"
,
"false"
)
code
.
arg
(
"bias"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"act_alpha"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_beta"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_type"
,
f
"tv::gemm::Activation"
,
"tv::gemm::Activation::kNone"
,
"cumm.tensorview.gemm.Activation = Activation.None_"
)
if
CUMM_CPU_ONLY_BUILD
:
if
CUMM_CPU_ONLY_BUILD
:
code
.
raw
(
f
"TV_THROW_RT_ERR(
\"
not implemented for cpu!!!
\"
)"
)
code
.
raw
(
f
"TV_THROW_RT_ERR(
\"
not implemented for cpu!!!
\"
)"
)
return
code
return
code
...
@@ -847,12 +853,13 @@ class GemmTunerSimple(pccm.ParameterizedClass):
...
@@ -847,12 +853,13 @@ class GemmTunerSimple(pccm.ParameterizedClass):
tv::gemm::GemmParams params;
tv::gemm::GemmParams params;
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
if (desp.is_nvrtc &&
(
desp_is_static
|| force_nvrtc
)){{
if
(force_nvrtc ||
(desp.is_nvrtc && desp_is_static)){{
params.nvrtc_params = cached_get_nvrtc_params(desp, profile_res.arch, stream_int);
params.nvrtc_params = cached_get_nvrtc_params(desp, profile_res.arch, stream_int);
}}
}}
params.a = a;
params.a = a;
params.b = b;
params.b = b;
params.c = c;
params.c = c;
params.d = bias;
params.a_inds = a_inds;
params.a_inds = a_inds;
params.b_inds = b_inds;
params.b_inds = b_inds;
params.c_inds = c_inds;
params.c_inds = c_inds;
...
@@ -861,6 +868,10 @@ class GemmTunerSimple(pccm.ParameterizedClass):
...
@@ -861,6 +868,10 @@ class GemmTunerSimple(pccm.ParameterizedClass):
params.stream = stream_int;
params.stream = stream_int;
params.alpha = alpha;
params.alpha = alpha;
params.beta = beta;
params.beta = beta;
params.act_alpha = act_alpha;
params.act_beta = act_beta;
params.act_type = act_type;
params.workspace = workspace;
params.workspace = workspace;
GemmMain::matmul2(params);
GemmMain::matmul2(params);
"""
)
"""
)
...
@@ -1257,15 +1268,18 @@ class ConvTunerSimple(pccm.ParameterizedClass):
...
@@ -1257,15 +1268,18 @@ class ConvTunerSimple(pccm.ParameterizedClass):
code
.
arg
(
"timer"
,
"tv::CUDAKernelTimer"
,
"tv::CUDAKernelTimer(false)"
,
code
.
arg
(
"timer"
,
"tv::CUDAKernelTimer"
,
"tv::CUDAKernelTimer(false)"
,
"cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(false)"
)
"cumm.tensorview.CUDAKernelTimer = CUDAKernelTimer(false)"
)
code
.
arg
(
"force_nvrtc"
,
f
"bool"
,
"false"
)
code
.
arg
(
"force_nvrtc"
,
f
"bool"
,
"false"
)
code
.
arg
(
"bias"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"act_alpha"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_beta"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_type"
,
f
"tv::gemm::Activation"
,
"tv::gemm::Activation::kNone"
,
"cumm.tensorview.gemm.Activation = Activation.None_"
)
if
CUMM_CPU_ONLY_BUILD
:
if
CUMM_CPU_ONLY_BUILD
:
code
.
raw
(
f
"TV_THROW_RT_ERR(
\"
not implemented for cpu!!!
\"
)"
)
code
.
raw
(
f
"TV_THROW_RT_ERR(
\"
not implemented for cpu!!!
\"
)"
)
return
code
return
code
code
.
raw
(
f
"""
code
.
raw
(
f
"""
auto desp = profile_res.algo_desp;
auto desp = profile_res.algo_desp;
if (force_nvrtc){{
desp.is_nvrtc = true;
}}
int split_k_slices = 1;
int split_k_slices = 1;
if (profile_res.splitk > 1){{
if (profile_res.splitk > 1){{
split_k_slices = profile_res.splitk;
split_k_slices = profile_res.splitk;
...
@@ -1276,7 +1290,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
...
@@ -1276,7 +1290,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
auto arch = profile_res.arch;
auto arch = profile_res.arch;
tv::gemm::ConvParams params(
{
NDIM_DONT_CARE
}
, op_type_cpp, timer);
tv::gemm::ConvParams params(
{
NDIM_DONT_CARE
}
, op_type_cpp, timer);
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
bool desp_is_static = prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end();
if (desp.is_nvrtc &&
(
desp_is_static
|| force_nvrtc
)){{
if
(force_nvrtc ||
(desp.is_nvrtc && desp_is_static)){{
params.nvrtc_params = cached_get_nvrtc_params(desp, arch, stream_int);
params.nvrtc_params = cached_get_nvrtc_params(desp, arch, stream_int);
}}
}}
params.conv_algo_desp = desp;
params.conv_algo_desp = desp;
...
@@ -1284,10 +1298,15 @@ class ConvTunerSimple(pccm.ParameterizedClass):
...
@@ -1284,10 +1298,15 @@ class ConvTunerSimple(pccm.ParameterizedClass):
params.weight = weight.view(channel_k, -1, channel_c);
params.weight = weight.view(channel_k, -1, channel_c);
params.output = output;
params.output = output;
params.verbose = verbose;
params.verbose = verbose;
params.bias = bias;
params.split_k_slices = split_k_slices;
params.split_k_slices = split_k_slices;
params.alpha = alpha;
params.alpha = alpha;
params.beta = beta;
params.beta = beta;
params.act_alpha = act_alpha;
params.act_beta = act_beta;
params.act_type = act_type;
params.stream = stream_int;
params.stream = stream_int;
params.mask_argsort = mask_argsort;
params.mask_argsort = mask_argsort;
params.indices = indices;
params.indices = indices;
...
@@ -1336,6 +1355,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
...
@@ -1336,6 +1355,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
GemmTuneResult
,
GemmTuneResult
,
ConvTuneResult
,
ConvTuneResult
,
ExternalSpconvMatmul
,
ExternalSpconvMatmul
,
InferenceOps
,
)
)
self
.
add_param_class
(
"gemm"
,
gemm_tuner
,
"GemmTuner"
)
self
.
add_param_class
(
"gemm"
,
gemm_tuner
,
"GemmTuner"
)
self
.
add_param_class
(
"conv"
,
conv_tuner
,
"ConvTuner"
)
self
.
add_param_class
(
"conv"
,
conv_tuner
,
"ConvTuner"
)
...
@@ -1384,11 +1404,18 @@ class ConvGemmOps(pccm.ParameterizedClass):
...
@@ -1384,11 +1404,18 @@ class ConvGemmOps(pccm.ParameterizedClass):
code
.
arg
(
"subm"
,
"bool"
,
"false"
)
code
.
arg
(
"subm"
,
"bool"
,
"false"
)
code
.
arg
(
"algo"
,
"int"
,
f
"
{
ConvAlgo
.
Native
.
value
}
"
)
code
.
arg
(
"algo"
,
"int"
,
f
"
{
ConvAlgo
.
Native
.
value
}
"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
arg
(
"bias"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"act_alpha"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_beta"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_type"
,
f
"tv::gemm::Activation"
,
"tv::gemm::Activation::kNone"
,
"cumm.tensorview.gemm.Activation = Activation.None_"
)
code
.
raw
(
f
"""
code
.
raw
(
f
"""
int kv_dim, out_channel, kv;
int kv_dim, out_channel, kv;
std::vector<int64_t> filter_shape_per_kv;
std::vector<int64_t> filter_shape_per_kv;
bool is_KC_not_CK;
bool is_KC_not_CK;
bool has_bias = !bias.empty();
bool has_act = act_type != tv::gemm::Activation::kNone;
if (!all_w_is_krsc){{
if (!all_w_is_krsc){{
kv_dim = 0;
kv_dim = 0;
is_KC_not_CK = !filter_hwio;
is_KC_not_CK = !filter_hwio;
...
@@ -1419,10 +1446,22 @@ class ConvGemmOps(pccm.ParameterizedClass):
...
@@ -1419,10 +1446,22 @@ class ConvGemmOps(pccm.ParameterizedClass):
out_features = allocator.zeros(
{
pccm
.
literal
(
AllocKeys
.
OutFeatures
)
}
,
out_features = allocator.zeros(
{
pccm
.
literal
(
AllocKeys
.
OutFeatures
)
}
,
{{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
{{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}}
}}
if (has_act || has_bias){{
TV_ASSERT_RT_ERR(!features.is_cpu(), "bias and act don't support cpu.");
}}
if (kv == 1 && subm){{
if (kv == 1 && subm){{
if (has_bias && has_act){{
InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
}}else{{
if (has_bias){{
InferenceOps::bias_add_inplace(out_features, bias, stream_int);
}}
if (has_act){{
InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
}}
}}
return;
return;
}}
}}
auto indice_pair_num_cpu = indice_pair_num.cpu();
auto indice_pair_num_cpu = indice_pair_num.cpu();
auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
int maxnhot = 0;
int maxnhot = 0;
...
@@ -1571,6 +1610,16 @@ class ConvGemmOps(pccm.ParameterizedClass):
...
@@ -1571,6 +1610,16 @@ class ConvGemmOps(pccm.ParameterizedClass):
beta);
beta);
inited = true;
inited = true;
}}
}}
if (has_bias && has_act){{
InferenceOps::bias_add_act_inplace(out_features, bias, act_type, act_alpha, act_beta, stream_int);
}}else{{
if (has_bias){{
InferenceOps::bias_add_inplace(out_features, bias, stream_int);
}}
if (has_act){{
InferenceOps::activation_inplace(out_features, act_type, act_alpha, act_beta, stream_int);
}}
}}
"""
)
"""
)
return
code
return
code
...
@@ -1913,11 +1962,21 @@ class ConvGemmOps(pccm.ParameterizedClass):
...
@@ -1913,11 +1962,21 @@ class ConvGemmOps(pccm.ParameterizedClass):
code
.
arg
(
"auto_fp32_accum"
,
"bool"
,
"true"
)
code
.
arg
(
"auto_fp32_accum"
,
"bool"
,
"true"
)
code
.
arg
(
"fp32_accum"
,
"bool"
,
"false"
)
code
.
arg
(
"fp32_accum"
,
"bool"
,
"false"
)
code
.
arg
(
"bias"
,
"tv::Tensor"
,
"tv::Tensor()"
,
"cumm.tensorview.Tensor = Tensor()"
)
code
.
arg
(
"act_alpha"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_beta"
,
f
"float"
,
"0.0"
)
code
.
arg
(
"act_type"
,
f
"tv::gemm::Activation"
,
"tv::gemm::Activation::kNone"
,
"cumm.tensorview.gemm.Activation = Activation.None_"
)
if
CUMM_CPU_ONLY_BUILD
:
if
CUMM_CPU_ONLY_BUILD
:
code
.
raw
(
f
"TV_THROW_RT_ERR(
\"
not implemented for cpu!!!
\"
)"
)
code
.
raw
(
f
"TV_THROW_RT_ERR(
\"
not implemented for cpu!!!
\"
)"
)
return
code
.
ret
(
"int"
)
return
code
.
ret
(
"int"
)
code
.
raw
(
f
"""
code
.
raw
(
f
"""
if (!bias.empty() || act_type != tv::gemm::Activation::kNone){{
TV_ASSERT_RT_ERR(pair_mask_fwd_splits.size() == 1, "SplitGemm don't support fused bias/act for now.");
}}
uint32_t* mask_ptr = masks.data_ptr<uint32_t>();
uint32_t* mask_ptr = masks.data_ptr<uint32_t>();
int num_mask = masks.dim(0);
int num_mask = masks.dim(0);
int out_channel = filters.dim(0);
int out_channel = filters.dim(0);
...
@@ -1989,6 +2048,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
...
@@ -1989,6 +2048,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
for (int j = 0; j < num_split; ++j){{
for (int j = 0; j < num_split; ++j){{
float beta = j == 0 ? 0 : 1;
float beta = j == 0 ? 0 : 1;
conv_tuner.run_with_tuned_result(
conv_tuner.run_with_tuned_result(
tune_res,
tune_res,
kForwardInt,
kForwardInt,
...
@@ -2006,7 +2066,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
...
@@ -2006,7 +2066,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
stream_int,
stream_int,
tv::Tensor(), // workspace
tv::Tensor(), // workspace
false, // verbose
false, // verbose
timer);
timer,
false,
bias,
act_alpha,
act_beta,
act_type);
}}
}}
// auto end_ev = tv::CUDAEvent();
// auto end_ev = tv::CUDAEvent();
// end_ev.record(stream_int);
// end_ev.record(stream_int);
...
...
spconv/csrc/sparse/inference.py
0 → 100644
View file @
d0bfb3a3
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pccm
from
cumm.common
import
TensorView
,
GemmDTypes
,
TensorViewKernel
,
ThrustLib
,
GemmBasic
from
spconv.csrc.sparse.cpu_core
import
OMPLib
from
cumm.constants
import
CUMM_CPU_ONLY_BUILD
class InferenceOpsKernel(pccm.ParameterizedClass):
    """CUDA global kernels for in-place bias add and activation.

    Each method builds a ``pccm.FunctionCode`` describing one templated
    (``T``) kernel. The bias kernels index ``out_features`` as ``size`` rows
    of ``num_features`` contiguous elements (``out_features + i *
    num_features``); the activation-only kernel walks ``size`` elements
    linearly.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorViewKernel, GemmBasic)

    @pccm.cuda.cuda_global_function
    def bias_add_inplace_kernel(self):
        """Kernel: ``out_features[i, j] += bias[j]``.

        Rows are iterated with ``tv::KernelLoopY`` and feature columns with
        ``tv::KernelLoopX``.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", "T*")
        code.arg("bias", "const T*")
        code.arg("size", "int")
        code.arg("num_features", "int")
        code.raw(f"""
        for (int i : tv::KernelLoopY<int>(size)) {{
          auto out_ptr = out_features + i * num_features;
          for (int j : tv::KernelLoopX<int>(num_features)) {{
            out_ptr[j] = bias[j] + out_ptr[j];
          }}
        }}
        """)
        return code

    @pccm.cuda.cuda_global_function
    def bias_add_act_inplace_kernel(self):
        """Kernel: ``out_features[i, j] = act(out_features[i, j] + bias[j])``.

        ``alpha`` is the LeakyReLU negative slope. ``beta`` is accepted but
        not read by any activation implemented here. Unknown activation
        types leave the biased value unchanged.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", "T*")
        code.arg("bias", "const T*")
        code.arg("act_type", "tv::gemm::Activation")
        code.arg("alpha", "T")
        code.arg("beta", "T")
        code.arg("size", "int")
        code.arg("num_features", "int")
        # Every case ends with an explicit break: without it kReLU fell
        # through into the kLeakyReLU branch.
        code.raw(f"""
        for (int i : tv::KernelLoopY<int>(size)) {{
          auto out_ptr = out_features + i * num_features;
          for (int j : tv::KernelLoopX<int>(num_features)) {{
            T o = out_ptr[j] + bias[j];
            switch (act_type){{
              case tv::gemm::Activation::kNone:
                break;
              case tv::gemm::Activation::kReLU:{{
                o = o >= T(0) ? o : T(0);
                break;
              }}
              case tv::gemm::Activation::kLeakyReLU:{{
                o = o >= T(0) ? o : o * alpha;
                break;
              }}
              default: ;
            }}
            out_ptr[j] = o;
          }}
        }}
        """)
        return code

    @pccm.cuda.cuda_global_function
    def activation_inplace_kernel(self):
        """Kernel: ``out_features[i] = act(out_features[i])`` over ``size`` elements.

        ``alpha`` is the LeakyReLU negative slope. ``beta`` is accepted but
        not read by any activation implemented here. ``kNone`` and unknown
        activation types leave the element untouched.
        """
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", "T*")
        code.arg("act_type", "tv::gemm::Activation")
        code.arg("alpha", "T")
        code.arg("beta", "T")
        code.arg("size", "int")
        # The break after kReLU is required for correctness: the LeakyReLU
        # branch recomputes from the original `o`, so falling through would
        # overwrite the clamped ReLU result with `o * alpha` for negative
        # inputs.
        code.raw(f"""
        for (int i : tv::KernelLoopX<int>(size)) {{
          T o = out_features[i];
          switch (act_type){{
            case tv::gemm::Activation::kNone:
              break;
            case tv::gemm::Activation::kReLU:{{
              out_features[i] = o >= T(0) ? o : T(0);
              break;
            }}
            case tv::gemm::Activation::kLeakyReLU:{{
              out_features[i] = o >= T(0) ? o : o * alpha;
              break;
            }}
            default: ;
          }}
        }}
        """)
        return code
class InferenceOps(pccm.Class):
    """Host-side, pybind-exported wrappers around ``InferenceOpsKernel``.

    All functions modify ``out`` in place and take the CUDA stream as an
    integer handle (``std::uintptr_t``). When ``CUMM_CPU_ONLY_BUILD`` is set,
    the functions are generated as plain static functions whose body raises a
    runtime error instead of launching a kernel.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        self.kernel = InferenceOpsKernel()
        self.add_include("tensorview/gemm/core/constants.h")

    # evaluated once in the class body so it can be used as a decorator below:
    # CUDA static functions for GPU builds, plain static functions otherwise.
    if CUMM_CPU_ONLY_BUILD:
        _DECORATOR = pccm.static_function
    else:
        _DECORATOR = pccm.cuda.static_function

    @pccm.pybind.mark
    @_DECORATOR
    def bias_add_act_inplace(self):
        """Generate ``out[i, j] = act(out[i, j] + bias[j])``.

        ``bias.dim(0)`` must equal ``out.dim(1)``. Supported dtypes are
        float, double, half and bfloat16. With ``act_type == kNone`` the
        plain bias-add kernel is launched, otherwise the fused bias +
        activation kernel. Empty tensors are a no-op.
        """
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("bias", "tv::Tensor")
        code.arg("act_type", "tv::gemm::Activation",
                 "tv::gemm::Activation::kNone",
                 "cumm.tensorview.gemm.Activation = Activation.None_")
        code.arg("alpha", "float", "0.0")
        code.arg("beta", "float", "0.0")
        code.arg("stream", "std::uintptr_t", "0")
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
            TV_THROW_RT_ERR("this function don't support cpu only build.")
            """)
            return code
        code.add_param_class("ker", self.kernel)
        code.raw(f"""
        auto nhot = out.dim(0);
        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
        TV_ASSERT_RT_ERR(bias.dim(0) == out.dim(1),
            "bias size must equal out.dim(1), got", bias.dim(0), "vs", out.dim(1));
        if (nhot == 0 || out.dim(1) == 0){{
            // nothing to process; also avoids launching with a zero-sized grid.
            return;
        }}
        constexpr int MaxThreads = 512;
        // block width along the feature axis. 16 is the fallback used when
        // out.dim(1) is smaller than every candidate in the list below.
        int NumFeatures = 16;
        tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
            // if out.dim(1) >= value in list above, run this function.
            // if a value is found, other value won't be executed.
            NumFeatures = TV_DECLTYPE(V)::value;
        }});
        // remaining threads of the block are spread over rows.
        int Num0 = MaxThreads / NumFeatures;
        dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
        dim3 threads(NumFeatures, Num0);
        tv::cuda::Launch launcher(blocks, threads, cudastream);
        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            if (act_type == tv::gemm::Activation::kNone){{
                launcher(ker::bias_add_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
                    nhot, out.dim(1));
            }}else{{
                launcher(ker::bias_add_act_inplace_kernel<T>, out.data_ptr<T>(), bias.data_ptr<const T>(),
                    act_type, T(alpha), T(beta), nhot, out.dim(1));
            }}
        }});
        """)
        return code

    @pccm.pybind.mark
    @_DECORATOR
    def bias_add_inplace(self):
        """Generate ``out[i, j] += bias[j]``.

        Thin wrapper that forwards to ``bias_add_act_inplace`` with
        ``Activation::kNone`` and zero ``alpha`` / ``beta``.
        """
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("bias", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        return bias_add_act_inplace(out, bias, tv::gemm::Activation::kNone, 0, 0, stream);
        """)
        return code

    @pccm.pybind.mark
    @_DECORATOR
    def activation_inplace(self):
        """Generate an element-wise in-place activation over all of ``out``.

        ``alpha`` is forwarded to the kernel (LeakyReLU negative slope).
        Supported dtypes are float, double, half and bfloat16. Nothing is
        launched for ``Activation::kNone`` or an empty tensor.
        """
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("act_type", "tv::gemm::Activation")
        code.arg("alpha", "float")
        code.arg("beta", "float")
        code.arg("stream", "std::uintptr_t", "0")
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
            TV_THROW_RT_ERR("this function don't support cpu only build.")
            """)
            return code
        code.add_param_class("ker", self.kernel)
        code.raw(f"""
        auto nhot = out.size();
        if (act_type == tv::gemm::Activation::kNone || nhot == 0){{
            // the kernel is a no-op for kNone, and an empty tensor would
            // need a zero-sized grid: skip the launch in both cases.
            return;
        }}
        auto cudastream = reinterpret_cast<cudaStream_t>(stream);
        tv::cuda::Launch launcher = tv::cuda::Launch(nhot, cudastream);
        tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            launcher(ker::activation_inplace_kernel<T>, out.data_ptr<T>(), act_type, T(alpha), T(beta),
                nhot);
        }});
        """)
        return code
spconv/gencode/__main__.py
View file @
d0bfb3a3
...
@@ -17,6 +17,7 @@ from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
...
@@ -17,6 +17,7 @@ from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.utils
import
BoxOps
from
cumm.gemm.algospec.core
import
(
GemmAlgo
,
ShuffleStrideType
)
from
cumm.gemm.algospec.core
import
(
GemmAlgo
,
ShuffleStrideType
)
from
cumm.conv.bases
import
ConvLayout
,
ConvLayoutType
,
ConvOpType
from
cumm.conv.bases
import
ConvLayout
,
ConvLayoutType
,
ConvOpType
from
spconv.csrc.sparse.inference
import
InferenceOps
def
main
(
include
:
str
,
def
main
(
include
:
str
,
...
@@ -60,6 +61,7 @@ def main(include: str,
...
@@ -60,6 +61,7 @@ def main(include: str,
ExternalSpconvMatmul
(),
ExternalSpconvMatmul
(),
SimpleExternalSpconvMatmul
(),
SimpleExternalSpconvMatmul
(),
StaticAllocator
(),
StaticAllocator
(),
InferenceOps
(),
]
]
gen_cmake
(
libname
,
cus
,
include
,
src
,
namespace_prefix
=
prefix
)
gen_cmake
(
libname
,
cus
,
include
,
src
,
namespace_prefix
=
prefix
)
...
...
spconv/pytorch/core.py
View file @
d0bfb3a3
...
@@ -162,6 +162,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
...
@@ -162,6 +162,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
assert
len
(
spatial_shape
)
==
ndim
,
"spatial shape must equal to ndim"
assert
len
(
spatial_shape
)
==
ndim
,
"spatial shape must equal to ndim"
assert
indices
.
dtype
==
torch
.
int32
,
"only support int32"
assert
indices
.
dtype
==
torch
.
int32
,
"only support int32"
assert
batch_size
>
0
assert
batch_size
>
0
# assert features.shape[0] == indices.shape[0]
self
.
_features
=
features
self
.
_features
=
features
self
.
indices
=
indices
self
.
indices
=
indices
self
.
spatial_shape
=
[
int
(
v
)
for
v
in
spatial_shape
]
self
.
spatial_shape
=
[
int
(
v
)
for
v
in
spatial_shape
]
...
@@ -197,6 +198,9 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
...
@@ -197,6 +198,9 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
return
new_spt
return
new_spt
def
minus
(
self
):
return
self
.
replace_feature
(
-
self
.
features
)
@
property
@
property
def
features
(
self
):
def
features
(
self
):
return
self
.
_features
return
self
.
_features
...
...
spconv/pytorch/ops.py
View file @
d0bfb3a3
...
@@ -41,7 +41,7 @@ else:
...
@@ -41,7 +41,7 @@ else:
GEMM_CPP
=
None
GEMM_CPP
=
None
CONV_CPP
=
None
CONV_CPP
=
None
import
time
import
time
from
spconv.constants
import
FILTER_HWIO
,
ALL_WEIGHT_IS_KRSC
,
AllocKeys
from
spconv.constants
import
FILTER_HWIO
,
ALL_WEIGHT_IS_KRSC
,
AllocKeys
,
SPCONV_USE_DIRECT_TABLE
from
cumm.gemm
import
codeops
from
cumm.gemm
import
codeops
from
spconv.tools
import
CUDAKernelTimer
from
spconv.tools
import
CUDAKernelTimer
...
@@ -101,6 +101,10 @@ class _HashData:
...
@@ -101,6 +101,10 @@ class _HashData:
dtype
=
torch
.
int32
,
dtype
=
torch
.
int32
,
device
=
device
)
device
=
device
)
hashdata_tv
=
torch_tensor_to_tv
(
self
.
hashdata
)
hashdata_tv
=
torch_tensor_to_tv
(
self
.
hashdata
)
if
num
==
0
:
self
.
hashdata_k_tv
=
tv
.
Tensor
()
self
.
hashdata_v_tv
=
tv
.
Tensor
()
else
:
self
.
hashdata_k_tv
=
hashdata_tv
[
0
]
self
.
hashdata_k_tv
=
hashdata_tv
[
0
]
self
.
hashdata_v_tv
=
hashdata_tv
[
1
]
self
.
hashdata_v_tv
=
hashdata_tv
[
1
]
...
@@ -315,7 +319,7 @@ def get_indice_pairs_implicit_gemm(
...
@@ -315,7 +319,7 @@ def get_indice_pairs_implicit_gemm(
alloc
:
Optional
[
ThrustSortAllocator
]
=
None
,
alloc
:
Optional
[
ThrustSortAllocator
]
=
None
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
num_out_act_bound
:
int
=
-
1
,
num_out_act_bound
:
int
=
-
1
,
direct_table
:
bool
=
True
):
direct_table
:
bool
=
SPCONV_USE_DIRECT_TABLE
):
"""
"""
Why return tuple? because pytorch seems don't support custom object in autograd.
Why return tuple? because pytorch seems don't support custom object in autograd.
return: (
return: (
...
@@ -535,7 +539,6 @@ def get_indice_pairs_implicit_gemm(
...
@@ -535,7 +539,6 @@ def get_indice_pairs_implicit_gemm(
indices
.
shape
[
0
],
ksize
,
stride
,
padding
,
dilation
)
indices
.
shape
[
0
],
ksize
,
stride
,
padding
,
dilation
)
if
transpose
:
if
transpose
:
max_num_act
=
kv
*
indices
.
shape
[
0
]
max_num_act
=
kv
*
indices
.
shape
[
0
]
pair_bwd
=
pair
pair_bwd
=
pair
pair_bwd_tv
=
pair_tv
pair_bwd_tv
=
pair_tv
indice_pairs_uniq
=
torch
.
empty
((
pair
.
numel
()
+
1
,
),
indice_pairs_uniq
=
torch
.
empty
((
pair
.
numel
()
+
1
,
),
...
...
test/benchmark.py
View file @
d0bfb3a3
...
@@ -32,9 +32,9 @@ def waymo_data(batch_size=1, num_features=-1):
...
@@ -32,9 +32,9 @@ def waymo_data(batch_size=1, num_features=-1):
# 150000)
# 150000)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
pc
=
np
.
ascontiguousarray
(
data
[
"pc"
])
pc
=
np
.
ascontiguousarray
(
data
[
"pc"
])
print
(
pc
.
shape
)
voxels_tv
,
indices_tv
,
_
=
gen
.
point_to_voxel
(
tv
.
from_numpy
(
pc
))
voxels_tv
,
indices_tv
,
_
=
gen
.
point_to_voxel
(
tv
.
from_numpy
(
pc
))
voxels
=
voxels_tv
.
numpy
().
reshape
(
-
1
,
3
)
voxels
=
voxels_tv
.
numpy
().
reshape
(
-
1
,
3
)
if
num_features
>
0
:
if
num_features
>
0
:
voxels
=
np
.
zeros
((
voxels
.
shape
[
0
],
num_features
),
dtype
=
voxels
.
dtype
)
voxels
=
np
.
zeros
((
voxels
.
shape
[
0
],
num_features
),
dtype
=
voxels
.
dtype
)
coors
=
indices_tv
.
numpy
()
coors
=
indices_tv
.
numpy
()
...
@@ -316,6 +316,7 @@ import json
...
@@ -316,6 +316,7 @@ import json
def
main
():
def
main
():
import
pickle
import
pickle
np
.
random
.
seed
(
50051
)
np
.
random
.
seed
(
50051
)
torch
.
manual_seed
(
50051
)
torch
.
manual_seed
(
50051
)
# voxels, coors, spatial_shape = waymo_data(num_features=128)
# voxels, coors, spatial_shape = waymo_data(num_features=128)
...
@@ -377,14 +378,6 @@ def main():
...
@@ -377,14 +378,6 @@ def main():
# print("------------")
# print("------------")
with
tv
.
measure_duration
()
as
measure
:
with
tv
.
measure_duration
()
as
measure
:
out_nograd
=
net
(
voxels_th
,
coors_th
,
1
,
show_metrics
)
out_nograd
=
net
(
voxels_th
,
coors_th
,
1
,
show_metrics
)
# res = timer.collect_by_name("forward", timer.get_all_pair_time())
# res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
# print(sum(res.values()) + sum(res2.values()))
# print(timer.get_all_pair_time())
# print(sum(timer.get_all_pair_time().values()))
# sort_bench()
times
.
append
(
measure
.
duration
)
times
.
append
(
measure
.
duration
)
if
show_metrics
:
if
show_metrics
:
timer
=
out_nograd
.
_timer
timer
=
out_nograd
.
_timer
...
...
test/test_all_algo.py
View file @
d0bfb3a3
...
@@ -31,6 +31,7 @@ import pccm
...
@@ -31,6 +31,7 @@ import pccm
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
spconv.core_cc.csrc.sparse.convops
import
GemmTuneResult
,
ConvTuneResult
from
spconv.core_cc.csrc.sparse.convops
import
GemmTuneResult
,
ConvTuneResult
from
spconv.pytorch.core
import
SparseConvTensor
from
spconv.test_utils
import
TestCase
from
spconv.test_utils
import
TestCase
from
cumm
import
tensorview
as
tv
from
cumm
import
tensorview
as
tv
from
cumm.conv.bases
import
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvOpType
from
cumm.conv.bases
import
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvOpType
...
@@ -44,8 +45,10 @@ from spconv.pytorch.cppcore import get_current_stream, torch_tensor_to_tv
...
@@ -44,8 +45,10 @@ from spconv.pytorch.cppcore import get_current_stream, torch_tensor_to_tv
from
spconv.test_utils
import
generate_sparse_data
,
params_grid
from
spconv.test_utils
import
generate_sparse_data
,
params_grid
import
tqdm
import
tqdm
from
spconv.constants
import
ALL_WEIGHT_IS_KRSC
,
SPCONV_CPP_GEMM
from
spconv.constants
import
ALL_WEIGHT_IS_KRSC
,
SPCONV_CPP_GEMM
from
spconv.core_cc.csrc.sparse.inference
import
InferenceOps
from
spconv.pytorch
import
functional
as
Fsp
assert
ALL_WEIGHT_IS_KRSC
is
True
,
"we only support KRSC in spconv >= 2.2"
assert
ALL_WEIGHT_IS_KRSC
is
True
,
"we only support KRSC in spconv >= 2.2"
from
spconv.pytorch.hash
import
HashTable
# TODO remove or release this when tf32 op is ready
# TODO remove or release this when tf32 op is ready
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
...
@@ -60,8 +63,9 @@ NUMPY_DTYPE_TO_TORCH = {
...
@@ -60,8 +63,9 @@ NUMPY_DTYPE_TO_TORCH = {
class
SparseConvTester
:
class
SparseConvTester
:
def
__init__
(
self
,
algo
:
ConvAlgo
,
subm
:
bool
,
shape
:
List
[
int
],
bs
:
int
,
dtype
:
np
.
dtype
,
N
:
int
,
K
:
int
,
C
:
int
,
def
__init__
(
self
,
algo
:
ConvAlgo
,
subm
:
bool
,
shape
:
List
[
int
],
bs
:
int
,
dtype
:
np
.
dtype
,
N
:
int
,
K
:
int
,
C
:
int
,
ksize
:
int
,
stride
:
int
,
padding
:
int
,
dilation
:
int
)
->
None
:
ksize
:
int
,
stride
:
int
,
padding
:
int
,
dilation
:
int
,
check_bias
:
bool
=
False
,
check_act
:
bool
=
False
)
->
None
:
ndim
=
3
ndim
=
3
transpose
=
False
self
.
shape
=
shape
self
.
shape
=
shape
self
.
bs
=
bs
self
.
bs
=
bs
self
.
dtype
=
dtype
self
.
dtype
=
dtype
...
@@ -77,6 +81,15 @@ class SparseConvTester:
...
@@ -77,6 +81,15 @@ class SparseConvTester:
op
=
expand_nd
(
ndim
,
0
)
op
=
expand_nd
(
ndim
,
0
)
self
.
kv
:
int
=
np
.
prod
(
self
.
ksize
)
self
.
kv
:
int
=
np
.
prod
(
self
.
ksize
)
self
.
num_split
=
1
if
algo
==
ConvAlgo
.
MaskImplicitGemm
else
2
self
.
num_split
=
1
if
algo
==
ConvAlgo
.
MaskImplicitGemm
else
2
if
not
subm
:
if
transpose
:
out_shape
=
ops
.
get_deconv_output_size
(
shape
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
op
)
else
:
out_shape
=
ops
.
get_conv_output_size
(
shape
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
self
.
dilation
)
else
:
out_shape
=
shape
sparse_dict
=
generate_sparse_data
(
shape
,
[
N
]
*
bs
,
C
)
sparse_dict
=
generate_sparse_data
(
shape
,
[
N
]
*
bs
,
C
)
...
@@ -88,10 +101,15 @@ class SparseConvTester:
...
@@ -88,10 +101,15 @@ class SparseConvTester:
out_inds
,
pair_ref
,
indice_num_per_loc
=
ops
.
get_indice_pairs
(
out_inds
,
pair_ref
,
indice_num_per_loc
=
ops
.
get_indice_pairs
(
indices_th
,
1
,
shape
,
ConvAlgo
.
Native
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
indices_th
,
1
,
shape
,
ConvAlgo
.
Native
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
op
,
subm
)
self
.
dilation
,
op
,
subm
)
self
.
ref_out_inds
=
out_inds
self
.
ref_out_inds_scalar
=
Fsp
.
_indice_to_scalar
(
out_inds
.
long
(),
[
bs
,
*
out_shape
])
self
.
indice_num_per_loc_np
=
indice_num_per_loc
.
cpu
().
numpy
()
self
.
indice_num_per_loc_np
=
indice_num_per_loc
.
cpu
().
numpy
()
self
.
indice_pairs_np
=
pair_ref
.
cpu
().
numpy
()
self
.
indice_pairs_np
=
pair_ref
.
cpu
().
numpy
()
self
.
pair_native
=
pair_ref
self
.
pair_native
=
pair_ref
self
.
indice_num_per_loc
=
indice_num_per_loc
self
.
indice_num_per_loc
=
indice_num_per_loc
self
.
use_direct_table
=
True
self
.
out_shape
=
out_shape
if
algo
==
ConvAlgo
.
Native
:
if
algo
==
ConvAlgo
.
Native
:
self
.
out_inds
:
torch
.
Tensor
=
out_inds
self
.
out_inds
:
torch
.
Tensor
=
out_inds
self
.
num_inds_per_loc
:
torch
.
Tensor
=
indice_num_per_loc
self
.
num_inds_per_loc
:
torch
.
Tensor
=
indice_num_per_loc
...
@@ -105,7 +123,7 @@ class SparseConvTester:
...
@@ -105,7 +123,7 @@ class SparseConvTester:
else
:
else
:
res
=
ops
.
get_indice_pairs_implicit_gemm
(
indices_th
,
bs
,
shape
,
res
=
ops
.
get_indice_pairs_implicit_gemm
(
indices_th
,
bs
,
shape
,
algo
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
algo
,
self
.
ksize
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
op
,
subm
=
subm
)
self
.
dilation
,
op
,
subm
=
subm
,
direct_table
=
self
.
use_direct_table
)
self
.
out_inds
=
res
[
0
]
self
.
out_inds
=
res
[
0
]
self
.
num_inds_per_loc
=
res
[
1
]
self
.
num_inds_per_loc
=
res
[
1
]
...
@@ -116,8 +134,27 @@ class SparseConvTester:
...
@@ -116,8 +134,27 @@ class SparseConvTester:
self
.
mask_argsort_fwd_splits
=
res
[
6
]
self
.
mask_argsort_fwd_splits
=
res
[
6
]
self
.
mask_argsort_bwd_splits
=
res
[
7
]
self
.
mask_argsort_bwd_splits
=
res
[
7
]
self
.
masks
=
res
[
8
]
self
.
masks
=
res
[
8
]
self
.
out_inds_scalar
=
Fsp
.
_indice_to_scalar
(
self
.
out_inds
.
long
(),
[
bs
,
*
out_shape
])
table
=
HashTable
(
out_inds
.
device
,
torch
.
int64
,
torch
.
int32
,
self
.
out_inds
.
shape
[
0
]
*
2
)
# test coords -> test out indexes
table
.
insert
(
self
.
out_inds_scalar
,
torch
.
arange
(
0
,
self
.
out_inds
.
shape
[
0
],
dtype
=
torch
.
int32
,
device
=
self
.
device
))
# out_order: test_order_to_ref, test index for each ref coord
out_order
,
is_empty
=
table
.
query
(
self
.
ref_out_inds_scalar
)
assert
is_empty
.
int
().
sum
().
item
()
==
0
,
"shouldn't happen"
self
.
out_order
=
out_order
.
cpu
().
numpy
()
# inp_table = HashTable(out_inds.device, torch.int64, torch.int32, self.ref_out_inds.shape[0] * 2)
# inp_table.insert(self.ref_out_inds_scalar, torch.arange(0, self.ref_out_inds.shape[0], dtype=torch.int32, device=self.device))
# # out_order: ref index for each out coord
# out_order, is_empty = inp_table.query(self.out_inds_scalar)
self
.
voxels_np
=
voxels_np
self
.
voxels_np
=
voxels_np
self
.
indices_np
=
indices_np
self
.
indices_np
=
indices_np
self
.
check_bias
=
check_bias
self
.
check_act
=
check_act
self
.
subm
=
subm
self
.
subm
=
subm
if
dtype
==
np
.
int8
:
if
dtype
==
np
.
int8
:
...
@@ -128,6 +165,10 @@ class SparseConvTester:
...
@@ -128,6 +165,10 @@ class SparseConvTester:
self
.
output
=
np
.
random
.
randint
(
-
2
,
2
,
size
=
[
self
.
output
=
np
.
random
.
randint
(
-
2
,
2
,
size
=
[
self
.
out_inds
.
shape
[
0
],
K
self
.
out_inds
.
shape
[
0
],
K
]).
astype
(
dtype
)
]).
astype
(
dtype
)
self
.
bias
=
np
.
random
.
randint
(
-
2
,
2
,
size
=
[
K
]).
astype
(
dtype
)
else
:
else
:
self
.
inp
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
self
.
inp
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
voxels_np
.
shape
[
0
],
C
voxels_np
.
shape
[
0
],
C
...
@@ -136,14 +177,25 @@ class SparseConvTester:
...
@@ -136,14 +177,25 @@ class SparseConvTester:
self
.
output
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
self
.
output
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
self
.
out_inds
.
shape
[
0
],
K
self
.
out_inds
.
shape
[
0
],
K
]).
astype
(
dtype
)
]).
astype
(
dtype
)
self
.
bias
=
np
.
random
.
uniform
(
-
1
,
1
,
size
=
[
K
]).
astype
(
dtype
)
self
.
weight_ref
=
self
.
weight
.
transpose
(
1
,
2
,
3
,
0
,
4
)
self
.
weight_ref
=
self
.
weight
.
transpose
(
1
,
2
,
3
,
0
,
4
)
self
.
weight_ref
=
np
.
ascontiguousarray
(
self
.
weight_ref
).
reshape
(
-
1
,
K
,
C
)
self
.
weight_ref
=
np
.
ascontiguousarray
(
self
.
weight_ref
).
reshape
(
-
1
,
K
,
C
)
self
.
out_ref
,
self
.
din_ref
,
self
.
dw_ref
=
self
.
_get_ref_output
()
self
.
out_ref
,
self
.
din_ref
,
self
.
dw_ref
=
self
.
_get_ref_output
()
if
check_bias
:
self
.
out_ref
+=
self
.
bias
# relu
if
check_act
:
self
.
out_ref
=
np
.
maximum
(
self
.
out_ref
,
0
)
self
.
dw_ref
=
np
.
ascontiguousarray
(
self
.
dw_ref
.
transpose
(
1
,
0
,
2
).
reshape
(
K
,
*
self
.
ksize
,
C
))
self
.
dw_ref
=
np
.
ascontiguousarray
(
self
.
dw_ref
.
transpose
(
1
,
0
,
2
).
reshape
(
K
,
*
self
.
ksize
,
C
))
self
.
arch
=
tv
.
get_compute_capability
()
self
.
arch
=
tv
.
get_compute_capability
()
def
get_output_ref_spt
(
self
):
return
SparseConvTensor
(
torch
.
from_numpy
(
self
.
out_ref
).
cuda
(),
self
.
ref_out_inds
,
self
.
out_shape
,
self
.
bs
)
def
_get_ref_output
(
self
):
def
_get_ref_output
(
self
):
output_ref
=
np
.
zeros_like
(
self
.
output
,
dtype
=
np
.
float32
)
output_ref
=
np
.
zeros_like
(
self
.
output
,
dtype
=
np
.
float32
)
dinput_ref
=
np
.
zeros_like
(
self
.
inp
,
dtype
=
np
.
float32
)
dinput_ref
=
np
.
zeros_like
(
self
.
inp
,
dtype
=
np
.
float32
)
...
@@ -165,13 +217,15 @@ class SparseConvTester:
...
@@ -165,13 +217,15 @@ class SparseConvTester:
np
.
float32
)
@
self
.
weight_ref
[
filter_offset
].
T
.
astype
(
np
.
float32
)
@
self
.
weight_ref
[
filter_offset
].
T
.
astype
(
np
.
float32
)
np
.
float32
)
output_ref
[
o_inds
]
+=
cc
output_ref
[
o_inds
]
+=
cc
a
=
self
.
output
[
o_inds
]
# we use random output as dout here
a
=
self
.
output
[
self
.
out_order
][
o_inds
]
# NK @ KC
# NK @ KC
cc
=
a
.
astype
(
cc
=
a
.
astype
(
np
.
float32
)
@
self
.
weight_ref
[
filter_offset
].
astype
(
np
.
float32
)
@
self
.
weight_ref
[
filter_offset
].
astype
(
np
.
float32
)
np
.
float32
)
dinput_ref
[
i_inds
]
+=
cc
dinput_ref
[
i_inds
]
+=
cc
out_gather
=
self
.
output
[
o_inds
]
# [N, K]
# use random output and random inp as dout and inp
out_gather
=
self
.
output
[
self
.
out_order
][
o_inds
]
# [N, K]
inp_gather
=
self
.
inp
[
i_inds
]
# [N, C]
inp_gather
=
self
.
inp
[
i_inds
]
# [N, C]
# KN @ NC
# KN @ NC
dw_res
=
out_gather
.
astype
(
dw_res
=
out_gather
.
astype
(
...
@@ -225,7 +279,7 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -225,7 +279,7 @@ def _test_impgemm_conv_cuda(subm: bool):
shapes
=
[[
19
,
18
,
17
]]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
]
batchsizes
=
[
1
]
dtypes
=
[
np
.
float32
,
np
.
float16
]
dtypes
=
[
np
.
float32
,
np
.
float16
]
dtypes
=
[
np
.
int8
]
#
dtypes = [np.int8]
test_case
=
TestCase
()
test_case
=
TestCase
()
# in_channels = [32]
# in_channels = [32]
# out_channels = [32, 48, 64]
# out_channels = [32, 48, 64]
...
@@ -245,6 +299,7 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -245,6 +299,7 @@ def _test_impgemm_conv_cuda(subm: bool):
strides
=
[
1
,
2
,
3
]
strides
=
[
1
,
2
,
3
]
paddings
=
[
0
,
1
]
paddings
=
[
0
,
1
]
dilations
=
[
1
,
2
]
dilations
=
[
1
,
2
]
algos
=
[
algos
=
[
# ConvAlgo.MaskSplitImplicitGemm,
# ConvAlgo.MaskSplitImplicitGemm,
ConvAlgo
.
MaskImplicitGemm
,
ConvAlgo
.
MaskImplicitGemm
,
...
@@ -261,11 +316,14 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -261,11 +316,14 @@ def _test_impgemm_conv_cuda(subm: bool):
multipler
=
max
(
C
,
K
)
/
multiple_base
multipler
=
max
(
C
,
K
)
/
multiple_base
multipler
=
max
(
multipler
,
1.0
)
multipler
=
max
(
multipler
,
1.0
)
# print(num_batch)
# print(num_batch)
tester
=
SparseConvTester
(
algo
,
subm
,
shape
,
bs
,
dtype
,
num_batch
,
K
,
C
,
k
,
s
,
p
,
d
)
tester
=
SparseConvTester
(
algo
,
subm
,
shape
,
bs
,
dtype
,
num_batch
,
K
,
C
,
k
,
s
,
p
,
d
,
check_bias
=
True
,
check_act
=
True
)
bias
=
None
act
=
tv
.
gemm
.
Activation
.
None_
if
tester
.
check_bias
:
bias
=
tv
.
from_numpy
(
tester
.
bias
).
cuda
()
atol
,
rtol
=
dtype_to_tol
[
dtype
]
atol
,
rtol
=
dtype_to_tol
[
dtype
]
mask_width_to_mask_out_fwd
:
Dict
[
int
,
torch
.
Tensor
]
=
{}
mask_width_to_mask_out_fwd
:
Dict
[
int
,
torch
.
Tensor
]
=
{}
mask_width_to_mask_out_bwd
:
Dict
[
int
,
torch
.
Tensor
]
=
{}
mask_width_to_mask_out_bwd
:
Dict
[
int
,
torch
.
Tensor
]
=
{}
op_types
=
[
ConvOpType
.
kForward
,
ConvOpType
.
kBackwardInput
]
op_types
=
[
ConvOpType
.
kForward
,
ConvOpType
.
kBackwardInput
]
spk
=
1
spk
=
1
for
op_type
in
op_types
:
for
op_type
in
op_types
:
...
@@ -276,7 +334,11 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -276,7 +334,11 @@ def _test_impgemm_conv_cuda(subm: bool):
NHWC
.
layout_type
.
value
,
NHWC
.
interleave
,
NHWC
.
interleave
,
NHWC
.
interleave
,
arch
,
op_type
.
value
,
-
1
,
True
,
False
)
NHWC
.
layout_type
.
value
,
NHWC
.
interleave
,
NHWC
.
interleave
,
NHWC
.
interleave
,
arch
,
op_type
.
value
,
-
1
,
True
,
False
)
else
:
else
:
avail_desps
=
CONV
.
get_all_available
(
inp_tv
,
weight_tv
,
output_tv
,
NHWC
,
NHWC
,
NHWC
,
arch
,
op_type
,
-
1
)
avail_desps
=
CONV
.
get_all_available
(
inp_tv
,
weight_tv
,
output_tv
,
NHWC
,
NHWC
,
NHWC
,
arch
,
op_type
,
-
1
)
if
op_type
==
ConvOpType
.
kForward
and
tester
.
check_act
:
act
=
tv
.
gemm
.
Activation
.
ReLU
else
:
act
=
tv
.
gemm
.
Activation
.
None_
assert
avail_desps
for
desp
in
avail_desps
:
for
desp
in
avail_desps
:
if
not
subm
:
if
not
subm
:
if
op_type
==
ConvOpType
.
kForward
:
if
op_type
==
ConvOpType
.
kForward
:
...
@@ -292,7 +354,10 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -292,7 +354,10 @@ def _test_impgemm_conv_cuda(subm: bool):
dtype
=
torch
.
int32
,
dtype
=
torch
.
int32
,
device
=
tester
.
device
)
device
=
tester
.
device
)
mask_output_fwd
=
mask_width_to_mask_out_fwd
[
mask_width
]
mask_output_fwd
=
mask_width_to_mask_out_fwd
[
mask_width
]
is_fwd
=
desp
.
op_type
.
value
==
ConvOpType
.
kForward
.
value
bias_cur
=
bias
if
op_type
!=
ConvOpType
.
kForward
:
bias_cur
=
None
if
subm
:
if
subm
:
if
desp
.
op_type
.
value
==
ConvOpType
.
kForward
.
value
:
if
desp
.
op_type
.
value
==
ConvOpType
.
kForward
.
value
:
indice_pairs
=
tester
.
pair_fwd
indice_pairs
=
tester
.
pair_fwd
...
@@ -303,9 +368,12 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -303,9 +368,12 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_output
=
mask_output_fwd
mask_output
=
mask_output_fwd
# print([bin(x.item()) for x in masks])
# print([bin(x.item()) for x in masks])
for
j
in
range
(
tester
.
num_split
):
for
j
in
range
(
tester
.
num_split
):
beta
=
1
if
j
==
1
else
0
beta
=
1
if
j
>
0
else
0
if
bias_cur
is
not
None
:
beta
=
1
if
j
>
0
:
bias_cur
=
None
mask_filter
=
tester
.
masks
[
j
].
item
()
mask_filter
=
tester
.
masks
[
j
].
item
()
reverse_mask
=
False
reverse_mask
=
False
if
desp
.
op_type
.
value
==
ConvOpType
.
kBackwardWeight
.
value
:
if
desp
.
op_type
.
value
==
ConvOpType
.
kBackwardWeight
.
value
:
mask_op
=
mask_output
[
j
]
mask_op
=
mask_output
[
j
]
...
@@ -338,6 +406,8 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -338,6 +406,8 @@ def _test_impgemm_conv_cuda(subm: bool):
beta
=
beta
,
beta
=
beta
,
verbose
=
False
,
verbose
=
False
,
force_nvrtc
=
force_nvrtc
,
force_nvrtc
=
force_nvrtc
,
bias
=
bias_cur
if
is_fwd
and
bias_cur
is
not
None
else
tv
.
Tensor
(),
act_type
=
act
,
)
)
else
:
else
:
CONV
.
run_with_tuned_result
(
CONV
.
run_with_tuned_result
(
...
@@ -356,6 +426,8 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -356,6 +426,8 @@ def _test_impgemm_conv_cuda(subm: bool):
beta
=
beta
,
beta
=
beta
,
verbose
=
False
,
verbose
=
False
,
force_nvrtc
=
force_nvrtc
,
force_nvrtc
=
force_nvrtc
,
bias
=
bias_cur
if
is_fwd
else
None
,
act_type
=
act
,
)
)
else
:
else
:
...
@@ -382,7 +454,12 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -382,7 +454,12 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_output
=
mask_output_fwd
mask_output
=
mask_output_fwd
for
j
in
range
(
tester
.
num_split
):
for
j
in
range
(
tester
.
num_split
):
beta
=
1
if
j
==
1
else
0
# beta = 1 if j == 1 else 0
beta
=
1
if
j
>
0
else
0
if
bias_cur
is
not
None
:
beta
=
1
if
j
>
0
:
bias_cur
=
None
mask_filter
=
tester
.
masks
[
j
].
item
()
mask_filter
=
tester
.
masks
[
j
].
item
()
reverse_mask
=
False
reverse_mask
=
False
if
desp
.
op_type
.
value
==
ConvOpType
.
kBackwardWeight
.
value
:
if
desp
.
op_type
.
value
==
ConvOpType
.
kBackwardWeight
.
value
:
...
@@ -406,6 +483,9 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -406,6 +483,9 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_width
=
mask_width
,
mask_width
=
mask_width
,
beta
=
beta
,
beta
=
beta
,
verbose
=
False
,
verbose
=
False
,
force_nvrtc
=
force_nvrtc
,
bias
=
bias
if
is_fwd
and
bias
is
not
None
else
tv
.
Tensor
(),
act_type
=
act
,
)
)
else
:
else
:
CONV
.
run_with_tuned_result
(
CONV
.
run_with_tuned_result
(
...
@@ -423,6 +503,9 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -423,6 +503,9 @@ def _test_impgemm_conv_cuda(subm: bool):
mask_width
=
mask_width
,
mask_width
=
mask_width
,
beta
=
beta
,
beta
=
beta
,
verbose
=
False
,
verbose
=
False
,
force_nvrtc
=
force_nvrtc
,
bias
=
bias
if
is_fwd
else
None
,
act_type
=
act
,
)
)
out_ref
=
tester
.
out_ref
out_ref
=
tester
.
out_ref
...
@@ -430,6 +513,7 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -430,6 +513,7 @@ def _test_impgemm_conv_cuda(subm: bool):
dw_ref
=
tester
.
dw_ref
dw_ref
=
tester
.
dw_ref
if
op_type
==
ConvOpType
.
kForward
:
if
op_type
==
ConvOpType
.
kForward
:
out_my
=
output_tv
.
cpu
().
numpy
()
out_my
=
output_tv
.
cpu
().
numpy
()
out_my
=
out_my
[
tester
.
out_order
]
if
dtype
!=
np
.
float16
:
if
dtype
!=
np
.
float16
:
test_case
.
assertAllClose
(
out_ref
,
out_my
,
atol
=
atol
,
rtol
=
rtol
)
test_case
.
assertAllClose
(
out_ref
,
out_my
,
atol
=
atol
,
rtol
=
rtol
)
else
:
else
:
...
@@ -437,7 +521,6 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -437,7 +521,6 @@ def _test_impgemm_conv_cuda(subm: bool):
if
(
error_norm
>
5
):
if
(
error_norm
>
5
):
print
(
f
"
{
desp
}
, Error=
{
error_norm
}
"
)
print
(
f
"
{
desp
}
, Error=
{
error_norm
}
"
)
assert
error_norm
<
10
*
multipler
assert
error_norm
<
10
*
multipler
# print(desp, )
else
:
else
:
din_my
=
inp_tv
.
cpu
().
numpy
()
din_my
=
inp_tv
.
cpu
().
numpy
()
if
dtype
!=
np
.
float16
:
if
dtype
!=
np
.
float16
:
...
@@ -446,7 +529,6 @@ def _test_impgemm_conv_cuda(subm: bool):
...
@@ -446,7 +529,6 @@ def _test_impgemm_conv_cuda(subm: bool):
error_norm
=
np
.
linalg
.
norm
(
din_ref
.
reshape
(
-
1
)
-
din_my
.
reshape
(
-
1
))
error_norm
=
np
.
linalg
.
norm
(
din_ref
.
reshape
(
-
1
)
-
din_my
.
reshape
(
-
1
))
assert
error_norm
<
10
*
multipler
,
f
"
{
desp
}
,
{
error_norm
}
,
{
k
}
,
{
s
}
,
{
p
}
,
{
d
}
"
assert
error_norm
<
10
*
multipler
,
f
"
{
desp
}
,
{
error_norm
}
,
{
k
}
,
{
s
}
,
{
p
}
,
{
d
}
"
inp_tv
,
weight_tv
,
output_tv
=
tester
.
get_operands
(
ConvOpType
.
kBackwardWeight
)
inp_tv
,
weight_tv
,
output_tv
=
tester
.
get_operands
(
ConvOpType
.
kBackwardWeight
)
for
spk
in
[
1
,
4
,
16
,
64
]:
for
spk
in
[
1
,
4
,
16
,
64
]:
for
mask_width
,
mask_output
in
mask_width_to_mask_out_fwd
.
items
():
for
mask_width
,
mask_output
in
mask_width_to_mask_out_fwd
.
items
():
if
SPCONV_CPP_GEMM
:
if
SPCONV_CPP_GEMM
:
...
@@ -554,7 +636,10 @@ def _test_native_conv_cuda(subm: bool):
...
@@ -554,7 +636,10 @@ def _test_native_conv_cuda(subm: bool):
for
shape
,
bs
,
C
,
K
,
k
,
s
,
p
,
d
,
dtype
in
tqdm
.
tqdm
(
params_grid
(
for
shape
,
bs
,
C
,
K
,
k
,
s
,
p
,
d
,
dtype
in
tqdm
.
tqdm
(
params_grid
(
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
,
dtypes
)):
strides
,
paddings
,
dilations
,
dtypes
)):
tester
=
SparseConvTester
(
ConvAlgo
.
Native
,
subm
,
shape
,
bs
,
dtype
,
1500
,
K
,
C
,
k
,
s
,
p
,
d
)
tester
=
SparseConvTester
(
ConvAlgo
.
Native
,
subm
,
shape
,
bs
,
dtype
,
1500
,
K
,
C
,
k
,
s
,
p
,
d
,
check_bias
=
True
,
check_act
=
True
)
bias
=
None
if
tester
.
check_bias
:
bias
=
tv
.
from_numpy
(
tester
.
bias
).
cuda
()
atol
,
rtol
=
dtype_to_tol
[
dtype
]
atol
,
rtol
=
dtype_to_tol
[
dtype
]
multipler
=
max
(
C
,
K
)
/
multiple_base
multipler
=
max
(
C
,
K
)
/
multiple_base
multipler
=
max
(
multipler
,
1.0
)
multipler
=
max
(
multipler
,
1.0
)
...
@@ -580,7 +665,6 @@ def _test_native_conv_cuda(subm: bool):
...
@@ -580,7 +665,6 @@ def _test_native_conv_cuda(subm: bool):
inp_tv
=
torch_tensor_to_tv
(
inp_th
)
inp_tv
=
torch_tensor_to_tv
(
inp_th
)
weight_tv
=
torch_tensor_to_tv
(
weight_th
)
weight_tv
=
torch_tensor_to_tv
(
weight_th
)
output_tv
=
torch_tensor_to_tv
(
output_th
)
output_tv
=
torch_tensor_to_tv
(
output_th
)
if
op_type
==
ConvOpType
.
kForward
:
if
op_type
==
ConvOpType
.
kForward
:
a
=
inp_tv
a
=
inp_tv
c
=
output_tv
c
=
output_tv
...
@@ -593,9 +677,11 @@ def _test_native_conv_cuda(subm: bool):
...
@@ -593,9 +677,11 @@ def _test_native_conv_cuda(subm: bool):
for
desp
in
avail_desps
:
for
desp
in
avail_desps
:
if
subm
:
if
subm
:
torch
.
mm
(
inp_th
,
weight_th
[:,
tester
.
kv
//
2
].
T
,
out
=
output_th
)
torch
.
mm
(
inp_th
,
weight_th
[:,
tester
.
kv
//
2
].
T
,
out
=
output_th
)
# output_th += bias_th
else
:
else
:
output_tv
.
zero_
()
output_tv
.
zero_
()
inited
=
subm
inited
=
subm
# determine last valid subm indices, then apply
for
i
,
nhot
in
enumerate
(
indice_pair_num_cpu
):
for
i
,
nhot
in
enumerate
(
indice_pair_num_cpu
):
if
subm
and
i
==
kv_center
:
if
subm
and
i
==
kv_center
:
continue
continue
...
@@ -643,8 +729,14 @@ def _test_native_conv_cuda(subm: bool):
...
@@ -643,8 +729,14 @@ def _test_native_conv_cuda(subm: bool):
hint
=
AlgoHint
.
Fowrard
.
value
,
hint
=
AlgoHint
.
Fowrard
.
value
,
alpha
=
1.0
,
alpha
=
1.0
,
beta
=
beta
)
beta
=
beta
)
inited
=
True
inited
=
True
if
bias
is
not
None
and
tester
.
check_act
:
InferenceOps
.
bias_add_act_inplace
(
output_tv
,
bias
,
tv
.
gemm
.
Activation
.
ReLU
,
0
,
0
)
else
:
if
bias
is
not
None
:
InferenceOps
.
bias_add_inplace
(
output_tv
,
bias
,
0
)
if
tester
.
check_act
:
InferenceOps
.
activation_inplace
(
output_tv
,
tv
.
gemm
.
Activation
.
ReLU
,
0
,
0
)
out_my
=
output_tv
.
cpu
().
numpy
()
out_my
=
output_tv
.
cpu
().
numpy
()
if
dtype
!=
np
.
float16
:
if
dtype
!=
np
.
float16
:
# error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
# error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
...
@@ -807,7 +899,7 @@ def _test_native_conv_cuda(subm: bool):
...
@@ -807,7 +899,7 @@ def _test_native_conv_cuda(subm: bool):
def
test_all_algo_unit
():
def
test_all_algo_unit
():
# for i in range(5):
# for i in range(5):
_test_impgemm_conv_cuda
(
True
)
_test_impgemm_conv_cuda
(
True
)
_test_impgemm_conv_cuda
(
Fals
e
)
_test_impgemm_conv_cuda
(
Tru
e
)
_test_native_conv_cuda
(
True
)
_test_native_conv_cuda
(
True
)
_test_native_conv_cuda
(
False
)
_test_native_conv_cuda
(
False
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment