Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
899008fa
Commit
899008fa
authored
Jul 20, 2022
by
yan.yan
Browse files
working on c++ only
parent
f78575ea
Changes
31
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3404 additions
and
327 deletions
+3404
-327
docs/DEVELOPMENT.md
docs/DEVELOPMENT.md
+0
-25
setup.py
setup.py
+22
-3
spconv/algo.py
spconv/algo.py
+187
-67
spconv/algocore.py
spconv/algocore.py
+13
-3
spconv/benchmark/me.py
spconv/benchmark/me.py
+0
-24
spconv/benchmark/thsp.py
spconv/benchmark/thsp.py
+0
-24
spconv/build.py
spconv/build.py
+35
-3
spconv/constants.py
spconv/constants.py
+30
-1
spconv/core.py
spconv/core.py
+177
-27
spconv/core_cc/csrc/sparse/all/__init__.pyi
spconv/core_cc/csrc/sparse/all/__init__.pyi
+43
-10
spconv/core_cc/csrc/sparse/alloc.pyi
spconv/core_cc/csrc/sparse/alloc.pyi
+18
-4
spconv/core_cc/csrc/sparse/convops/__init__.pyi
spconv/core_cc/csrc/sparse/convops/__init__.pyi
+96
-0
spconv/core_cc/csrc/sparse/convops/convops.pyi
spconv/core_cc/csrc/sparse/convops/convops.pyi
+126
-0
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+107
-0
spconv/core_cc/csrc/sparse/convops/spops.pyi
spconv/core_cc/csrc/sparse/convops/spops.pyi
+101
-0
spconv/core_cc/cumm/common.pyi
spconv/core_cc/cumm/common.pyi
+7
-0
spconv/core_cc/cumm/gemm/main.pyi
spconv/core_cc/cumm/gemm/main.pyi
+3
-2
spconv/csrc/sparse/all.py
spconv/csrc/sparse/all.py
+217
-79
spconv/csrc/sparse/alloc.py
spconv/csrc/sparse/alloc.py
+190
-7
spconv/csrc/sparse/convops.py
spconv/csrc/sparse/convops.py
+2032
-48
No files found.
docs/DEVELOPMENT.md
deleted
100644 → 0
View file @
f78575ea
<!--
Copyright 2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# How to develop spconv 2.x
## First step
spconv 2.x is written in a unique c++ framework
```pccm```
. read
[
pccm guide
](
)
to learn how to use
```pccm```
.
It's recommended to uninstall spconv and cumm installed by pip, then install spconv and cumm both in editable mode (
```pip install -e .```
)
## Architecture
\ No newline at end of file
setup.py
View file @
899008fa
...
@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1":
...
@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1":
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.hash.core
import
HashTable
from
spconv.csrc.hash.core
import
HashTable
from
cumm.common
import
CompileInfo
from
cumm.common
import
CompileInfo
from
spconv.csrc.sparse.alloc
import
ExternalAllocator
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
convcu
=
ConvMainUnitTest
(
IMPLGEMM_SIMT_PARAMS
+
IMPLGEMM_VOLTA_PARAMS
+
IMPLGEMM_TURING_PARAMS
)
convcu
=
ConvMainUnitTest
(
IMPLGEMM_SIMT_PARAMS
+
IMPLGEMM_VOLTA_PARAMS
+
IMPLGEMM_TURING_PARAMS
)
...
@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1":
...
@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1":
std
=
"c++14"
std
=
"c++14"
else
:
else
:
std
=
"c++17"
std
=
"c++17"
cus
=
[
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
()]
if
CUMM_CPU_ONLY_BUILD
:
if
CUMM_CPU_ONLY_BUILD
:
cus
=
[
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
()]
gemmtuner
=
GemmTunerSimple
(
cu
)
gemmtuner
.
namespace
=
"csrc.sparse.convops.gemmops"
convtuner
=
ConvTunerSimple
(
convcu
)
convtuner
.
namespace
=
"csrc.sparse.convops.convops"
convops
=
ConvGemmOps
(
gemmtuner
,
convtuner
)
convops
.
namespace
=
"csrc.sparse.convops.spops"
else
:
gemmtuner
=
GemmTunerSimple
(
None
)
gemmtuner
.
namespace
=
"csrc.sparse.convops.gemmops"
convtuner
=
ConvTunerSimple
(
None
)
convtuner
.
namespace
=
"csrc.sparse.convops.convops"
convops
=
ConvGemmOps
(
gemmtuner
,
convtuner
)
convops
.
namespace
=
"csrc.sparse.convops.spops"
cus
=
[
gemmtuner
,
convtuner
,
convops
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
ExternalAllocator
(),
ExternalSpconvMatmul
()]
if
not
CUMM_CPU_ONLY_BUILD
:
cus
.
extend
([
cu
,
convcu
])
ext_modules
:
List
[
Extension
]
=
[
ext_modules
:
List
[
Extension
]
=
[
PCCMExtension
(
cus
,
PCCMExtension
(
cus
,
"spconv/core_cc"
,
"spconv/core_cc"
,
Path
(
__file__
).
resolve
().
parent
/
"spconv"
,
Path
(
__file__
).
resolve
().
parent
/
"spconv"
,
objects_folder
=
"objects"
,
std
=
std
,
std
=
std
,
disable_pch
=
True
,
disable_pch
=
True
,
verbose
=
True
)
verbose
=
True
)
...
...
spconv/algo.py
View file @
899008fa
...
@@ -37,7 +37,7 @@ from cumm import dtypes
...
@@ -37,7 +37,7 @@ from cumm import dtypes
from
spconv.constants
import
(
NDIM_DONT_CARE
,
SPCONV_BWD_SPLITK
,
from
spconv.constants
import
(
NDIM_DONT_CARE
,
SPCONV_BWD_SPLITK
,
SPCONV_NVRTC_MODE
,
SPCONV_DEBUG_NVRTC_KERNELS
)
SPCONV_NVRTC_MODE
,
SPCONV_DEBUG_NVRTC_KERNELS
)
from
spconv.core
import
ALL_IMPGEMM_PARAMS
,
AlgoHint
,
ConvAlgo
from
spconv.core
import
ALL_IMPGEMM_PARAMS
,
AlgoHint
,
ConvAlgo
,
ALL_NATIVE_PARAMS
from
spconv.core_cc.cumm.conv.main
import
ConvMainUnitTest
from
spconv.core_cc.cumm.conv.main
import
ConvMainUnitTest
from
spconv.core_cc.cumm.gemm.main
import
GemmMainUnitTest
from
spconv.core_cc.cumm.gemm.main
import
GemmMainUnitTest
from
spconv.cppconstants
import
COMPILED_CUDA_ARCHS
from
spconv.cppconstants
import
COMPILED_CUDA_ARCHS
...
@@ -49,14 +49,17 @@ from spconv import algocore
...
@@ -49,14 +49,17 @@ from spconv import algocore
from
cumm.conv.main
import
gen_gemm_kernels
as
gen_conv_kernels
from
cumm.conv.main
import
gen_gemm_kernels
as
gen_conv_kernels
from
cumm.gemm.main
import
gen_gemm_kernels
from
cumm.gemm.main
import
gen_gemm_kernels
from
spconv.core_cc.csrc.sparse.convops
import
GemmTuneResult
,
ConvTuneResult
from
spconv.core_cc.csrc.sparse.convops.gemmops
import
GemmTunerSimple
as
GemmTunerSimpleBase
from
spconv.core_cc.csrc.sparse.convops.convops
import
ConvTunerSimple
as
ConvTunerSimpleBase
ALL_ALGO_DESPS
=
GemmMainUnitTest
.
get_all_algo_desp
()
ALL_ALGO_DESPS
=
GemmMainUnitTest
.
get_all_algo_desp
()
ALL_CONV_ALGO_DESPS
=
ConvMainUnitTest
.
get_all_conv_algo_desp
()
ALL_CONV_ALGO_DESPS
=
ConvMainUnitTest
.
get_all_conv_algo_desp
()
_GEMM_STATIC_KEY
=
Tuple
[
bool
,
bool
,
bool
,
int
,
int
,
int
,
str
,
str
]
_GEMM_STATIC_KEY
=
Tuple
[
bool
,
bool
,
bool
,
int
,
int
,
int
,
int
,
str
]
class
SimpleGemmAlgoMeta
:
class
SimpleGemmAlgoMeta
:
def
__init__
(
self
,
tile_ms
:
List
[
int
],
tile_ns
:
List
[
int
],
def
__init__
(
self
,
tile_ms
:
List
[
int
],
tile_ns
:
List
[
int
],
tile_ks
:
List
[
int
],
tile_ks
:
List
[
int
],
tile_shape_to_algos
:
Dict
[
int
,
List
[
int
]])
->
None
:
tile_shape_to_algos
:
Dict
[
int
,
List
[
int
]])
->
None
:
...
@@ -67,19 +70,29 @@ class SimpleGemmAlgoMeta:
...
@@ -67,19 +70,29 @@ class SimpleGemmAlgoMeta:
class
BestAlgoByProfile
:
class
BestAlgoByProfile
:
def
__init__
(
self
,
algo_desp
:
GemmAlgoDesp
,
arch
:
Tuple
[
int
,
int
],
splitk
:
int
=
1
)
->
None
:
def
__init__
(
self
,
algo_desp
:
GemmAlgoDesp
,
arch
:
Tuple
[
int
,
int
],
splitk
:
int
=
1
)
->
None
:
self
.
algo_desp
=
algo_desp
self
.
algo_desp
=
algo_desp
self
.
splitk
=
splitk
self
.
splitk
=
splitk
self
.
arch
=
arch
self
.
arch
=
arch
class
BestConvAlgoByProfile
:
class
BestConvAlgoByProfile
:
def
__init__
(
self
,
algo_desp
:
ConvAlgoDesp
,
arch
:
Tuple
[
int
,
int
],
splitk
:
int
=
1
)
->
None
:
def
__init__
(
self
,
algo_desp
:
ConvAlgoDesp
,
arch
:
Tuple
[
int
,
int
],
splitk
:
int
=
1
)
->
None
:
self
.
algo_desp
=
algo_desp
self
.
algo_desp
=
algo_desp
self
.
splitk
=
splitk
self
.
splitk
=
splitk
self
.
arch
=
arch
self
.
arch
=
arch
def
_get_nvrtc_params
(
mod
:
CummNVRTCModule
,
ker
:
Union
[
GemmKernel
,
ConvKernel
],
kernel_name
:
str
):
def
_get_nvrtc_params
(
mod
:
CummNVRTCModule
,
ker
:
Union
[
GemmKernel
,
ConvKernel
],
kernel_name
:
str
):
nvrtc_mode
=
SPCONV_NVRTC_MODE
nvrtc_mode
=
SPCONV_NVRTC_MODE
nvrtc_params
=
tv
.
gemm
.
NVRTCParams
()
nvrtc_params
=
tv
.
gemm
.
NVRTCParams
()
nvrtc_params
.
cumodule
=
mod
.
get_cpp_object
()
nvrtc_params
.
cumodule
=
mod
.
get_cpp_object
()
...
@@ -89,8 +102,7 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
...
@@ -89,8 +102,7 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
ns
=
ker
.
namespace
ns
=
ker
.
namespace
if
nvrtc_mode
==
NVRTCMode
.
DynamicParallism
:
if
nvrtc_mode
==
NVRTCMode
.
DynamicParallism
:
nvrtc_params
.
kernel_name
=
mod
.
get_lowered_name
(
nvrtc_params
.
kernel_name
=
mod
.
get_lowered_name
(
f
"
{
ns
}
::nvrtc_kernel"
)
f
"
{
ns
}
::nvrtc_kernel"
)
elif
nvrtc_mode
==
NVRTCMode
.
KernelAndCPU
:
elif
nvrtc_mode
==
NVRTCMode
.
KernelAndCPU
:
nvrtc_params
.
kernel_name
=
mod
.
get_lowered_name
(
f
"
{
ns
}
::
{
kernel_name
}
"
)
nvrtc_params
.
kernel_name
=
mod
.
get_lowered_name
(
f
"
{
ns
}
::
{
kernel_name
}
"
)
...
@@ -101,8 +113,10 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
...
@@ -101,8 +113,10 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
nvrtc_params
.
param_storage
=
tv
.
empty
([
nvrtc_params
.
param_size
],
nvrtc_params
.
param_storage
=
tv
.
empty
([
nvrtc_params
.
param_size
],
tv
.
uint8
,
0
)
tv
.
uint8
,
0
)
nvrtc_params
.
param_storage_cpu
=
tv
.
empty
(
nvrtc_params
.
param_storage_cpu
=
tv
.
empty
([
nvrtc_params
.
param_size
],
[
nvrtc_params
.
param_size
],
tv
.
uint8
,
-
1
,
pinned
=
True
)
tv
.
uint8
,
-
1
,
pinned
=
True
)
elif
nvrtc_mode
==
NVRTCMode
.
Direct
:
elif
nvrtc_mode
==
NVRTCMode
.
Direct
:
nvrtc_params
.
kernel_name
=
mod
.
get_lowered_name
(
f
"
{
ns
}
::
{
kernel_name
}
"
)
nvrtc_params
.
kernel_name
=
mod
.
get_lowered_name
(
f
"
{
ns
}
::
{
kernel_name
}
"
)
...
@@ -120,9 +134,84 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
...
@@ -120,9 +134,84 @@ def _get_nvrtc_params(mod: CummNVRTCModule, ker: Union[GemmKernel, ConvKernel],
raise
NotImplementedError
raise
NotImplementedError
return
nvrtc_params
return
nvrtc_params
class GemmTunerSimple(GemmTunerSimpleBase):
    """GEMM tuner extending ``GemmTunerSimpleBase`` with NVRTC compilation.

    The base class is imported from
    ``spconv.core_cc.csrc.sparse.convops.gemmops``. This subclass adds a
    per-instance cache of ``NVRTCParams`` so that each
    (descriptor, arch, stream) combination is compiled at most once.
    """

    def __init__(self, desps: List[GemmAlgoDesp]) -> None:
        """Forward ``desps`` to the base tuner and create an empty NVRTC cache."""
        super().__init__(desps)
        # (str(desp), arch, stream_int) -> NVRTCParams built for that combination.
        self._nvrtc_caches: Dict[Tuple[str, Tuple[int, int], int], NVRTCParams] = {}

    def _compile_nvrtc_module(self, desp: GemmAlgoDesp):
        """Generate the GEMM kernel for ``desp`` and load it as an NVRTC module.

        Returns:
            A ``(module, kernel)`` pair: the loaded ``CummNVRTCModule`` and the
            kernel object it was built from.

        Raises:
            AssertionError: in ``DynamicParallism`` mode when
                ``get_cudadevrt_path()`` returns ``None``.
        """
        kernel = gen_gemm_kernels(algocore.get_gemm_param_from_desp(desp),
                                  SPCONV_NVRTC_MODE)
        kernel.namespace = "spconv"

        # Only the constant-memory mode registers an extra name expression.
        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
            custom_names = [
                f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
            ]
        else:
            custom_names = []

        # Dynamic parallelism is the only mode that needs a cudadevrt path.
        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
            devrt_path = get_cudadevrt_path()
            assert devrt_path is not None, "DynamicParallism must have cudadevrt"
            cudadevrt = str(devrt_path)
        else:
            cudadevrt = ""

        nvrtc_module = CummNVRTCModule([kernel],
                                       cudadevrt_path=cudadevrt,
                                       custom_names=custom_names)
        nvrtc_module.load()
        return nvrtc_module, kernel

    def cached_get_nvrtc_params(self, desp: GemmAlgoDesp,
                                arch: Tuple[int, int],
                                stream_int: int) -> NVRTCParams:
        """Return the ``NVRTCParams`` for ``desp``, compiling on first request.

        The cache key is ``(str(desp), arch, stream_int)``, so the same
        descriptor requested with a different ``stream_int`` is compiled and
        cached separately.
        """
        cache_key = (str(desp), arch, stream_int)
        if cache_key not in self._nvrtc_caches:
            nvrtc_module, kernel = self._compile_nvrtc_module(desp)
            self._nvrtc_caches[cache_key] = _get_nvrtc_params(
                nvrtc_module, kernel, "gemm_kernel")
        return self._nvrtc_caches[cache_key]
class ConvTunerSimple(ConvTunerSimpleBase):
    """Convolution tuner extending ``ConvTunerSimpleBase`` with NVRTC compilation.

    The base class is imported from
    ``spconv.core_cc.csrc.sparse.convops.convops``. This subclass adds a
    per-instance cache of ``NVRTCParams`` so that each
    (descriptor, arch, stream) combination is compiled at most once.
    """

    def __init__(self, desps: List[ConvAlgoDesp]) -> None:
        """Forward ``desps`` to the base tuner and create an empty NVRTC cache."""
        super().__init__(desps)
        # (str(desp), arch, stream_int) -> NVRTCParams built for that combination.
        self._nvrtc_caches: Dict[Tuple[str, Tuple[int, int], int], NVRTCParams] = {}

    def _compile_nvrtc_module(self, desp: ConvAlgoDesp):
        """Generate the conv kernel for ``desp`` and load it as an NVRTC module.

        Returns:
            A ``(module, kernel)`` pair: the loaded ``CummNVRTCModule`` and the
            kernel object it was built from.

        Raises:
            AssertionError: in ``DynamicParallism`` mode when
                ``get_cudadevrt_path()`` returns ``None``.
        """
        params = algocore.get_conv_param_from_desp(desp)
        kernel = gen_conv_kernels(params, SPCONV_NVRTC_MODE)
        kernel.namespace = "spconv"
        custom_names = []
        if SPCONV_NVRTC_MODE == NVRTCMode.ConstantMemory:
            # Only the constant-memory mode registers an extra name expression.
            custom_names = [
                f"&{kernel.namespace}::{NVRTCConstants.CONSTANT_PARAM_KEY}"
            ]
        cudadevrt = ""
        if SPCONV_NVRTC_MODE == NVRTCMode.DynamicParallism:
            # Dynamic parallelism is the only mode that needs a cudadevrt path.
            cudadevrt_p = get_cudadevrt_path()
            assert cudadevrt_p is not None, "DynamicParallism must have cudadevrt"
            cudadevrt = str(cudadevrt_p)
        mod = CummNVRTCModule([kernel],
                              cudadevrt_path=cudadevrt,
                              verbose=False,
                              custom_names=custom_names)
        mod.load()
        return mod, kernel

    def cached_get_nvrtc_params(self, desp: ConvAlgoDesp,
                                arch: Tuple[int, int],
                                stream_int: int) -> NVRTCParams:
        """Return the ``NVRTCParams`` for ``desp``, compiling on first request.

        The cache key is ``(str(desp), arch, stream_int)``, so the same
        descriptor requested with a different ``stream_int`` is compiled and
        cached separately. A notice is printed on every cache miss.
        """
        key = (str(desp), arch, stream_int)
        if key in self._nvrtc_caches:
            return self._nvrtc_caches[key]
        # Announce the compilation before it starts: the message describes
        # work that is about to happen, and it stays visible even if the
        # compile step is slow or raises.
        print(f"Can't find algo {desp} in prebuilt. compile with nvrtc...")
        mod, ker = self._compile_nvrtc_module(desp)
        nvrtc_params = _get_nvrtc_params(mod, ker, "conv_kernel")
        self._nvrtc_caches[key] = nvrtc_params
        return nvrtc_params
class
SimpleGemm
:
class
SimpleGemm
:
def
__init__
(
self
,
prebuilt_desps
:
List
[
GemmAlgoDesp
])
->
None
:
def
__init__
(
self
,
prebuilt_desps
:
List
[
GemmAlgoDesp
])
->
None
:
all_desps
=
[
algocore
.
get_conv_algo_desp_from_param
(
p
)
for
p
in
ALL_IMPGEMM_PARAMS
]
all_desps
=
[
algocore
.
get_gemm_algo_desp_from_param
(
p
)
for
p
in
ALL_NATIVE_PARAMS
]
self
.
prebuilt_desps
=
prebuilt_desps
self
.
prebuilt_desps
=
prebuilt_desps
self
.
prebuilt_desp_names
=
{
str
(
d
)
for
d
in
prebuilt_desps
}
self
.
prebuilt_desp_names
=
{
str
(
d
)
for
d
in
prebuilt_desps
}
if
SPCONV_DEBUG_NVRTC_KERNELS
:
if
SPCONV_DEBUG_NVRTC_KERNELS
:
...
@@ -178,7 +267,9 @@ class SimpleGemm:
...
@@ -178,7 +267,9 @@ class SimpleGemm:
kernel
.
namespace
=
"spconv"
kernel
.
namespace
=
"spconv"
custom_names
=
[]
custom_names
=
[]
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
ConstantMemory
:
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
ConstantMemory
:
custom_names
=
[
f
"&
{
kernel
.
namespace
}
::
{
NVRTCConstants
.
CONSTANT_PARAM_KEY
}
"
]
custom_names
=
[
f
"&
{
kernel
.
namespace
}
::
{
NVRTCConstants
.
CONSTANT_PARAM_KEY
}
"
]
cudadevrt
=
""
cudadevrt
=
""
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
DynamicParallism
:
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
DynamicParallism
:
cudadevrt_p
=
get_cudadevrt_path
()
cudadevrt_p
=
get_cudadevrt_path
()
...
@@ -186,12 +277,12 @@ class SimpleGemm:
...
@@ -186,12 +277,12 @@ class SimpleGemm:
cudadevrt
=
str
(
cudadevrt_p
)
cudadevrt
=
str
(
cudadevrt_p
)
mod
=
CummNVRTCModule
([
kernel
],
mod
=
CummNVRTCModule
([
kernel
],
cudadevrt_path
=
cudadevrt
,
cudadevrt_path
=
cudadevrt
,
verbose
=
False
,
custom_names
=
custom_names
)
custom_names
=
custom_names
)
mod
.
load
()
mod
.
load
()
return
mod
,
kernel
return
mod
,
kernel
def
_cached_get_nvrtc_params
(
self
,
desp
:
GemmAlgoDesp
,
arch
:
Tuple
[
int
,
int
]):
def
_cached_get_nvrtc_params
(
self
,
desp
:
GemmAlgoDesp
,
arch
:
Tuple
[
int
,
int
]):
key
=
(
str
(
desp
),
arch
)
key
=
(
str
(
desp
),
arch
)
if
key
in
self
.
_nvrtc_caches
:
if
key
in
self
.
_nvrtc_caches
:
return
self
.
_nvrtc_caches
[
key
]
return
self
.
_nvrtc_caches
[
key
]
...
@@ -218,12 +309,15 @@ class SimpleGemm:
...
@@ -218,12 +309,15 @@ class SimpleGemm:
trans_c
=
False
trans_c
=
False
avail_algos
=
get_available_algo_str_from_arch
(
arch
)
avail_algos
=
get_available_algo_str_from_arch
(
arch
)
finally_algos
:
List
[
GemmAlgoDesp
]
=
[]
finally_algos
:
List
[
GemmAlgoDesp
]
=
[]
# print(self.static_key_to_desps)
for
algo
in
avail_algos
:
for
algo
in
avail_algos
:
static_key
=
(
trans_a
,
trans_b
,
trans_c
,
a
.
dtype
,
b
.
dtype
,
c
.
dtype
,
static_key
=
(
trans_a
,
trans_b
,
trans_c
,
a
.
dtype
,
b
.
dtype
,
c
.
dtype
,
shuffle_type
.
value
,
algo
)
shuffle_type
.
value
,
algo
)
# print(static_key)
desps
=
self
.
static_key_to_desps
.
get
(
static_key
,
None
)
desps
=
self
.
static_key_to_desps
.
get
(
static_key
,
None
)
if
desps
is
None
or
len
(
desps
)
==
0
:
if
desps
is
None
or
len
(
desps
)
==
0
:
continue
continue
# print(desps)
for
desp
in
desps
:
for
desp
in
desps
:
# skip volta tensor op since it is very slow in architectures except volta.
# skip volta tensor op since it is very slow in architectures except volta.
if
arch
>=
(
7
,
5
)
and
desp
.
algo
==
GemmAlgo
.
Volta
.
value
:
if
arch
>=
(
7
,
5
)
and
desp
.
algo
==
GemmAlgo
.
Volta
.
value
:
...
@@ -430,6 +524,7 @@ class SimpleGemm:
...
@@ -430,6 +524,7 @@ class SimpleGemm:
best_scatter_params
=
(
-
1
,
-
1
,
-
1
,
-
1
)
best_scatter_params
=
(
-
1
,
-
1
,
-
1
,
-
1
)
all_profile_res
:
List
[
BestAlgoByProfile
]
=
[]
all_profile_res
:
List
[
BestAlgoByProfile
]
=
[]
# print(avail)
for
desp
in
avail
:
for
desp
in
avail
:
c_
.
zero_whole_storage_
()
c_
.
zero_whole_storage_
()
split_k_slices
=
1
split_k_slices
=
1
...
@@ -466,7 +561,8 @@ class SimpleGemm:
...
@@ -466,7 +561,8 @@ class SimpleGemm:
times
.
append
(
np
.
mean
(
this_times
[
1
:]))
times
.
append
(
np
.
mean
(
this_times
[
1
:]))
spk_speeds
.
append
(
times
[
-
1
])
spk_speeds
.
append
(
times
[
-
1
])
all_profile_res
.
append
(
BestAlgoByProfile
(
desp
,
arch
,
splitk
=
spk
))
all_profile_res
.
append
(
BestAlgoByProfile
(
desp
,
arch
,
splitk
=
spk
))
min_time
=
1000
min_time
=
1000
min_idx
=
-
1
min_idx
=
-
1
...
@@ -490,8 +586,7 @@ class SimpleGemm:
...
@@ -490,8 +586,7 @@ class SimpleGemm:
return
res
,
min_time
return
res
,
min_time
def
run_with_tuned_result
(
def
run_with_tuned_result
(
self
,
self
,
profile_res
:
BestAlgoByProfile
,
profile_res
:
BestAlgoByProfile
,
a
:
tv
.
Tensor
,
a
:
tv
.
Tensor
,
b
:
tv
.
Tensor
,
b
:
tv
.
Tensor
,
...
@@ -501,7 +596,7 @@ class SimpleGemm:
...
@@ -501,7 +596,7 @@ class SimpleGemm:
trans_c
:
bool
,
trans_c
:
bool
,
arch
:
Tuple
[
int
,
int
],
arch
:
Tuple
[
int
,
int
],
stream
:
int
,
stream
:
int
,
shuffle_type
:
ShuffleStrideType
=
ShuffleStrideType
.
NoShuffle
,
shuffle_type
:
ShuffleStrideType
,
a_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
a_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
b_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
b_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
c_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
c_inds
:
tv
.
Tensor
=
tv
.
Tensor
(),
...
@@ -510,7 +605,8 @@ class SimpleGemm:
...
@@ -510,7 +605,8 @@ class SimpleGemm:
beta
:
float
=
0.0
,
beta
:
float
=
0.0
,
gather_data
:
tv
.
Tensor
=
tv
.
Tensor
(),
gather_data
:
tv
.
Tensor
=
tv
.
Tensor
(),
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
force_nvrtc
:
bool
=
False
):
m
,
n
,
k
=
GemmMainUnitTest
.
extract_mnk
(
a
.
shape
,
b
.
shape
,
trans_a
,
m
,
n
,
k
=
GemmMainUnitTest
.
extract_mnk
(
a
.
shape
,
b
.
shape
,
trans_a
,
trans_b
,
trans_c
,
trans_b
,
trans_c
,
shuffle_type
.
value
,
shuffle_type
.
value
,
...
@@ -526,8 +622,10 @@ class SimpleGemm:
...
@@ -526,8 +622,10 @@ class SimpleGemm:
if
profile_res
.
splitk
>
1
:
if
profile_res
.
splitk
>
1
:
split_k_slices
=
profile_res
.
splitk
split_k_slices
=
profile_res
.
splitk
params
=
GemmParams
()
params
=
GemmParams
()
if
algo_desp
.
is_nvrtc
and
str
(
algo_desp
)
not
in
self
.
prebuilt_desp_names
:
is_not_static
=
str
(
algo_desp
)
not
in
self
.
prebuilt_desp_names
params
.
nvrtc_params
=
self
.
_cached_get_nvrtc_params
(
algo_desp
,
profile_res
.
arch
)
if
algo_desp
.
is_nvrtc
and
(
is_not_static
or
force_nvrtc
):
params
.
nvrtc_params
=
self
.
_cached_get_nvrtc_params
(
algo_desp
,
profile_res
.
arch
)
params
.
a
=
a
params
.
a
=
a
params
.
b
=
b
params
.
b
=
b
...
@@ -569,8 +667,12 @@ _CONV_STATIC_KEY = Tuple[int, int, int, int, int, int, int, int, int, str, int]
...
@@ -569,8 +667,12 @@ _CONV_STATIC_KEY = Tuple[int, int, int, int, int, int, int, int, int, str, int]
class
SimpleConv
:
class
SimpleConv
:
def
__init__
(
self
,
prebuilt_desps
:
List
[
ConvAlgoDesp
])
->
None
:
def
__init__
(
self
,
prebuilt_desps
:
List
[
ConvAlgoDesp
])
->
None
:
all_desps
=
[
algocore
.
get_conv_algo_desp_from_param
(
p
)
for
p
in
ALL_IMPGEMM_PARAMS
]
all_desps
=
[
algocore
.
get_conv_algo_desp_from_param
(
p
)
for
p
in
ALL_IMPGEMM_PARAMS
]
self
.
prebuilt_desps
=
prebuilt_desps
self
.
prebuilt_desps
=
prebuilt_desps
self
.
prebuilt_desp_names
=
{
str
(
d
)
for
d
in
prebuilt_desps
}
self
.
prebuilt_desp_names
=
{
str
(
d
)
for
d
in
prebuilt_desps
}
self
.
prebuilt_desp_names
.
clear
()
self
.
prebuilt_desp_names
.
clear
()
...
@@ -650,6 +752,7 @@ class SimpleConv:
...
@@ -650,6 +752,7 @@ class SimpleConv:
use_f32_as_accum
=
weight
.
dim
(
0
)
*
kv
>
128
*
27
use_f32_as_accum
=
weight
.
dim
(
0
)
*
kv
>
128
*
27
else
:
else
:
use_f32_as_accum
=
fp32_accum
use_f32_as_accum
=
fp32_accum
use_f32_as_accum
=
False
for
algo
in
avail_algos
:
for
algo
in
avail_algos
:
static_key
=
(
layout_i
.
layout_type
.
value
,
static_key
=
(
layout_i
.
layout_type
.
value
,
layout_w
.
layout_type
.
value
,
layout_w
.
layout_type
.
value
,
...
@@ -664,7 +767,6 @@ class SimpleConv:
...
@@ -664,7 +767,6 @@ class SimpleConv:
if
arch
>=
(
7
,
5
)
and
desp
.
algo
==
GemmAlgo
.
Volta
.
value
:
if
arch
>=
(
7
,
5
)
and
desp
.
algo
==
GemmAlgo
.
Volta
.
value
:
continue
continue
if
arch
>=
(
7
,
0
)
and
is_fp16
:
if
arch
>=
(
7
,
0
)
and
is_fp16
:
# skip simt fp16 kernels if we have tensor core
if
desp
.
algo
==
GemmAlgo
.
Simt
:
if
desp
.
algo
==
GemmAlgo
.
Simt
:
continue
continue
if
use_f32_as_accum
:
if
use_f32_as_accum
:
...
@@ -675,6 +777,7 @@ class SimpleConv:
...
@@ -675,6 +777,7 @@ class SimpleConv:
ldw
=
weight
.
dim
(
-
1
)
ldw
=
weight
.
dim
(
-
1
)
ldo
=
out
.
dim
(
-
1
)
ldo
=
out
.
dim
(
-
1
)
mask_width_valid
=
True
mask_width_valid
=
True
if
desp
.
op_type
==
ConvOpType
.
kBackwardWeight
.
value
:
if
desp
.
op_type
==
ConvOpType
.
kBackwardWeight
.
value
:
assert
mask_width
>
0
assert
mask_width
>
0
mask_width_valid
=
mask_width
%
desp
.
tile_shape
[
2
]
==
0
mask_width_valid
=
mask_width
%
desp
.
tile_shape
[
2
]
==
0
...
@@ -722,7 +825,9 @@ class SimpleConv:
...
@@ -722,7 +825,9 @@ class SimpleConv:
kernel
.
namespace
=
"spconv"
kernel
.
namespace
=
"spconv"
custom_names
=
[]
custom_names
=
[]
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
ConstantMemory
:
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
ConstantMemory
:
custom_names
=
[
f
"&
{
kernel
.
namespace
}
::
{
NVRTCConstants
.
CONSTANT_PARAM_KEY
}
"
]
custom_names
=
[
f
"&
{
kernel
.
namespace
}
::
{
NVRTCConstants
.
CONSTANT_PARAM_KEY
}
"
]
cudadevrt
=
""
cudadevrt
=
""
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
DynamicParallism
:
if
SPCONV_NVRTC_MODE
==
NVRTCMode
.
DynamicParallism
:
cudadevrt_p
=
get_cudadevrt_path
()
cudadevrt_p
=
get_cudadevrt_path
()
...
@@ -735,10 +840,12 @@ class SimpleConv:
...
@@ -735,10 +840,12 @@ class SimpleConv:
mod
.
load
()
mod
.
load
()
return
mod
,
kernel
return
mod
,
kernel
def
_cached_get_nvrtc_params
(
self
,
desp
:
ConvAlgoDesp
,
arch
:
Tuple
[
int
,
int
]):
def
_cached_get_nvrtc_params
(
self
,
desp
:
ConvAlgoDesp
,
arch
:
Tuple
[
int
,
int
]):
key
=
(
str
(
desp
),
arch
)
key
=
(
str
(
desp
),
arch
)
if
key
in
self
.
_nvrtc_caches
:
if
key
in
self
.
_nvrtc_caches
:
return
self
.
_nvrtc_caches
[
key
]
return
self
.
_nvrtc_caches
[
key
]
print
(
f
"Can't find algo
{
desp
}
in prebuilt. compile with nvrtc..."
)
mod
,
ker
=
self
.
_compile_nvrtc_module
(
desp
)
mod
,
ker
=
self
.
_compile_nvrtc_module
(
desp
)
nvrtc_params
=
_get_nvrtc_params
(
mod
,
ker
,
"conv_kernel"
)
nvrtc_params
=
_get_nvrtc_params
(
mod
,
ker
,
"conv_kernel"
)
self
.
_nvrtc_caches
[
key
]
=
nvrtc_params
self
.
_nvrtc_caches
[
key
]
=
nvrtc_params
...
@@ -795,8 +902,8 @@ class SimpleConv:
...
@@ -795,8 +902,8 @@ class SimpleConv:
params
.
indices
=
indices
params
.
indices
=
indices
params
.
mask
=
mask
params
.
mask
=
mask
params
.
mask_output
=
mask_output
params
.
mask_output
=
mask_output
if
op_type
==
ConvOpType
.
kBackwardWeight
:
#
if op_type == ConvOpType.kBackwardWeight:
assert
not
mask_output
.
empty
()
#
assert not mask_output.empty()
if
op_type
==
ConvOpType
.
kBackwardInput
:
if
op_type
==
ConvOpType
.
kBackwardInput
:
params
.
reverse_mask
=
reverse_mask
params
.
reverse_mask
=
reverse_mask
params
.
mask_filter
=
mask_filter
params
.
mask_filter
=
mask_filter
...
@@ -808,20 +915,20 @@ class SimpleConv:
...
@@ -808,20 +915,20 @@ class SimpleConv:
spk_speeds
=
[]
spk_speeds
=
[]
for
spk
in
splitk_tests
:
for
spk
in
splitk_tests
:
this_times
=
[]
this_times
=
[]
for
j
in
range
(
3
):
for
j
in
range
(
4
):
GemmMainUnitTest
.
stream_synchronize
(
stream
)
t
=
time
.
time
()
params
.
split_k_slices
=
spk
params
.
split_k_slices
=
spk
if
desp
.
is_nvrtc
and
str
(
desp
)
not
in
self
.
prebuilt_desp_names
:
with
tv
.
measure_duration
(
stream
=
stream
)
as
measure
:
if
desp
.
is_nvrtc
and
str
(
desp
)
not
in
self
.
prebuilt_desp_names
:
tv
.
gemm
.
run_nvrtc_conv_kernel
(
params
)
tv
.
gemm
.
run_nvrtc_conv_kernel
(
params
)
else
:
else
:
ConvMainUnitTest
.
implicit_gemm2
(
params
)
ConvMainUnitTest
.
implicit_gemm2
(
params
)
GemmMainUnitTest
.
stream_synchronize
(
stream
)
this_times
.
append
(
measure
.
duration
)
this_times
.
append
(
time
.
time
()
-
t
)
times
.
append
(
np
.
mean
(
this_times
[
1
:]))
times
.
append
(
np
.
mean
(
this_times
[
1
:]))
spk_speeds
.
append
(
times
[
-
1
])
spk_speeds
.
append
(
times
[
-
1
])
all_profile_res
.
append
(
BestConvAlgoByProfile
(
desp
,
arch
,
splitk
=
spk
))
all_profile_res
.
append
(
BestConvAlgoByProfile
(
desp
,
arch
,
splitk
=
spk
))
if
not
all_profile_res
:
if
not
all_profile_res
:
raise
ValueError
(
"can't find suitable algorithm for"
,
op_type
)
raise
ValueError
(
"can't find suitable algorithm for"
,
op_type
)
min_time
=
1000
min_time
=
1000
...
@@ -865,7 +972,8 @@ class SimpleConv:
...
@@ -865,7 +972,8 @@ class SimpleConv:
stream
:
int
=
0
,
stream
:
int
=
0
,
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
workspace
:
tv
.
Tensor
=
tv
.
Tensor
(),
verbose
:
bool
=
False
,
verbose
:
bool
=
False
,
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
)):
timer
:
CUDAKernelTimer
=
CUDAKernelTimer
(
False
),
force_nvrtc
:
bool
=
False
):
channel_k
=
output
.
dim
(
1
)
channel_k
=
output
.
dim
(
1
)
channel_c
=
inp
.
dim
(
1
)
channel_c
=
inp
.
dim
(
1
)
# GemmMainUnitTest.stream_synchronize(stream)
# GemmMainUnitTest.stream_synchronize(stream)
...
@@ -879,13 +987,17 @@ class SimpleConv:
...
@@ -879,13 +987,17 @@ class SimpleConv:
else
:
else
:
op_type_value
=
op_type
.
value
op_type_value
=
op_type
.
value
params
=
ConvParams
(
NDIM_DONT_CARE
,
ConvOpTypeCpp
(
op_type_value
))
params
=
ConvParams
(
NDIM_DONT_CARE
,
ConvOpTypeCpp
(
op_type_value
))
if
algo_desp
.
is_nvrtc
and
str
(
algo_desp
)
not
in
self
.
prebuilt_desp_names
:
is_not_static
=
str
(
params
.
nvrtc_params
=
self
.
_cached_get_nvrtc_params
(
algo_desp
,
profile_res
.
arch
)
algo_desp
)
not
in
self
.
prebuilt_desp_names
if
algo_desp
.
is_nvrtc
and
(
is_not_static
or
force_nvrtc
):
params
.
nvrtc_params
=
self
.
_cached_get_nvrtc_params
(
algo_desp
,
profile_res
.
arch
)
params
.
conv_algo_desp
=
profile_res
.
algo_desp
params
.
conv_algo_desp
=
profile_res
.
algo_desp
params
.
input
=
inp
params
.
input
=
inp
params
.
verbose
=
verbose
params
.
verbose
=
verbose
params
.
weight
=
weight
.
view
([
channel_k
,
-
1
,
channel_c
])
params
.
weight
=
weight
.
view
([
channel_k
,
-
1
,
channel_c
])
params
.
output
=
output
params
.
output
=
output
params
.
split_k_slices
=
split_k_slices
params
.
split_k_slices
=
split_k_slices
params
.
alpha
=
alpha
params
.
alpha
=
alpha
params
.
beta
=
beta
params
.
beta
=
beta
...
@@ -893,6 +1005,7 @@ class SimpleConv:
...
@@ -893,6 +1005,7 @@ class SimpleConv:
params
.
mask_argsort
=
mask_argsort
params
.
mask_argsort
=
mask_argsort
params
.
indices
=
indices
params
.
indices
=
indices
params
.
mask
=
mask
params
.
mask
=
mask
params
.
mask_filter
=
mask_filter
params
.
mask_filter
=
mask_filter
params
.
mask_width
=
mask_width
params
.
mask_width
=
mask_width
params
.
mask_filter
=
mask_filter
params
.
mask_filter
=
mask_filter
...
@@ -919,6 +1032,13 @@ class SimpleConv:
...
@@ -919,6 +1032,13 @@ class SimpleConv:
GEMM
=
SimpleGemm
(
ALL_ALGO_DESPS
)
GEMM
=
SimpleGemm
(
ALL_ALGO_DESPS
)
CONV
=
SimpleConv
(
ALL_CONV_ALGO_DESPS
)
CONV
=
SimpleConv
(
ALL_CONV_ALGO_DESPS
)
GEMM_CPP
=
GemmTunerSimple
([
algocore
.
get_gemm_algo_desp_from_param
(
p
)
for
p
in
ALL_NATIVE_PARAMS
])
CONV_CPP
=
ConvTunerSimple
([
algocore
.
get_conv_algo_desp_from_param
(
p
)
for
p
in
ALL_IMPGEMM_PARAMS
])
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
print
(
len
(
ALL_CONV_ALGO_DESPS
))
print
(
len
(
ALL_CONV_ALGO_DESPS
))
print
(
ALL_CONV_ALGO_DESPS
[
0
])
print
(
ALL_CONV_ALGO_DESPS
[
0
])
spconv/algocore.py
View file @
899008fa
...
@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp
...
@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp
from
cumm.tensorview.gemm
import
ShuffleStrideType
as
ShuffleStrideTypeCpp
from
cumm.tensorview.gemm
import
ShuffleStrideType
as
ShuffleStrideTypeCpp
from
cumm.tensorview.gemm
import
ConvParams
,
GemmAlgoDesp
,
GemmParams
from
cumm.tensorview.gemm
import
ConvParams
,
GemmAlgoDesp
,
GemmParams
from
cumm.gemm.main
import
GemmAlgoParams
from
cumm.gemm.main
import
GemmAlgoParams
,
gen_gemm_kernels
from
cumm.conv.main
import
ConvAlgoParams
,
ConvIterAlgo
from
cumm.conv.main
import
ConvAlgoParams
,
ConvIterAlgo
,
gen_gemm_kernels
as
gen_conv_kernels
from
cumm
import
dtypes
from
cumm
import
dtypes
from
cumm.conv.bases
import
(
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvLayout
,
from
cumm.conv.bases
import
(
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvLayout
,
ConvLayoutType
,
ConvMode
,
ConvOpType
)
ConvLayoutType
,
ConvMode
,
ConvOpType
)
...
@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
...
@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
desp
.
access_per_vector
=
p
.
access_per_vector
desp
.
access_per_vector
=
p
.
access_per_vector
desp
.
is_nvrtc
=
p
.
is_nvrtc
desp
.
is_nvrtc
=
p
.
is_nvrtc
def
get_gemm_algo_desp_from_param
(
p
:
GemmAlgoParams
):
def
get_gemm_algo_desp_from_param
(
p
:
GemmAlgoParams
):
desp
=
GemmAlgoDesp
()
desp
=
GemmAlgoDesp
()
_assign_gemm_desp_props
(
desp
,
p
)
_assign_gemm_desp_props
(
desp
,
p
)
# here we must generate kernel for element-per-access data
ker
=
gen_gemm_kernels
(
p
)
desp
.
element_per_access_a
=
ker
.
input_spec
.
input_iter_a
.
element_per_acc
desp
.
element_per_access_b
=
ker
.
input_spec
.
input_iter_b
.
element_per_acc
desp
.
element_per_access_c
=
ker
.
output_spec
.
out_iter
.
element_per_acc
return
desp
return
desp
...
@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
...
@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
desp
.
interleave_o
=
p
.
layout_desp_output
.
interleave
desp
.
interleave_o
=
p
.
layout_desp_output
.
interleave
desp
.
mask_sparse
=
p
.
mask_sparse
desp
.
mask_sparse
=
p
.
mask_sparse
desp
.
increment_k_first
=
p
.
increment_k_first
desp
.
increment_k_first
=
p
.
increment_k_first
ker
=
gen_conv_kernels
(
p
)
desp
.
element_per_access_a
=
ker
.
input_spec
.
input_iter_a
.
element_per_acc
desp
.
element_per_access_b
=
ker
.
input_spec
.
input_iter_b
.
element_per_acc
desp
.
element_per_access_c
=
ker
.
output_spec
.
out_iter
.
element_per_acc
return
desp
return
desp
...
@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
...
@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
p
.
is_nvrtc
=
desp
.
is_nvrtc
p
.
is_nvrtc
=
desp
.
is_nvrtc
def
get_gemm_param_from_desp
(
desp
:
GemmAlgoDesp
):
def
get_gemm_param_from_desp
(
desp
:
GemmAlgoDesp
):
p
=
GemmAlgoParams
((
0
,
0
,
0
),
(
0
,
0
,
0
),
0
,
"s8,s8,s8,s8,s8"
,
False
,
False
,
p
=
GemmAlgoParams
((
0
,
0
,
0
),
(
0
,
0
,
0
),
0
,
"s8,s8,s8,s8,s8"
,
False
,
False
,
False
,
GemmAlgo
.
Simt
)
False
,
GemmAlgo
.
Simt
)
...
...
spconv/benchmark/me.py
deleted
100644 → 0
View file @
f78575ea
"""Benchmark MinkowskiEngine
"""
from
spconv.benchmark.core
import
get_voxel_data
import
time
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
from
spconv.core
import
ConvAlgo
from
cumm
import
dtypes
from
spconv.test_utils
import
params_grid
_DTYPE_TO_TORCH_DTYPE
=
{
dtypes
.
float32
:
torch
.
float32
,
dtypes
.
float16
:
torch
.
float16
,
}
def
bench_me_basic
(
dtype_str
:
str
):
dtype
=
dtypes
.
get_dtype_by_shortcut
(
dtype_str
)
if
dtype
not
in
_DTYPE_TO_TORCH_DTYPE
:
raise
NotImplementedError
(
"only support bench f32 and f16 for now"
)
torch_dtype
=
_DTYPE_TO_TORCH_DTYPE
[
dtype
]
spconv/benchmark/thsp.py
deleted
100644 → 0
View file @
f78575ea
"""Benchmark torchsparse
"""
from
spconv.benchmark.core
import
get_voxel_data
import
time
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
from
spconv.core
import
ConvAlgo
from
cumm
import
dtypes
from
spconv.test_utils
import
params_grid
_DTYPE_TO_TORCH_DTYPE
=
{
dtypes
.
float32
:
torch
.
float32
,
dtypes
.
float16
:
torch
.
float16
,
}
def
bench_torchsparse_basic
(
dtype_str
:
str
):
dtype
=
dtypes
.
get_dtype_by_shortcut
(
dtype_str
)
if
dtype
not
in
_DTYPE_TO_TORCH_DTYPE
:
raise
NotImplementedError
(
"only support bench f32 and f16 for now"
)
torch_dtype
=
_DTYPE_TO_TORCH_DTYPE
[
dtype
]
spconv/build.py
View file @
899008fa
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
# limitations under the License.
# limitations under the License.
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
List
import
pccm
import
pccm
from
pccm.utils
import
project_is_editable
,
project_is_installed
from
pccm.utils
import
project_is_editable
,
project_is_installed
...
@@ -32,6 +33,10 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
...
@@ -32,6 +33,10 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from
spconv.csrc.sparse.alloc
import
ExternalAllocator
from
spconv.csrc.sparse.alloc
import
ExternalAllocator
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.hash.core
import
HashTable
from
spconv.csrc.hash.core
import
HashTable
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
from
spconv.csrc.sparse.convops
import
SimpleExternalSpconvMatmul
all_shuffle
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
all_shuffle
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
all_shuffle
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_shuffle
))
all_shuffle
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_shuffle
))
cu
=
GemmMainUnitTest
(
all_shuffle
)
cu
=
GemmMainUnitTest
(
all_shuffle
)
...
@@ -41,8 +46,35 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
...
@@ -41,8 +46,35 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
all_imp
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_imp
))
all_imp
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_imp
))
convcu
=
ConvMainUnitTest
(
all_imp
)
convcu
=
ConvMainUnitTest
(
all_imp
)
convcu
.
namespace
=
"cumm.conv.main"
convcu
.
namespace
=
"cumm.conv.main"
pccm
.
builder
.
build_pybind
([
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
ExternalAllocator
()],
gemmtuner
=
GemmTunerSimple
(
cu
)
gemmtuner
.
namespace
=
"csrc.sparse.convops.gemmops"
convtuner
=
ConvTunerSimple
(
convcu
)
convtuner
.
namespace
=
"csrc.sparse.convops.convops"
convops
=
ConvGemmOps
(
gemmtuner
,
convtuner
)
convops
.
namespace
=
"csrc.sparse.convops.spops"
cus
=
[
cu
,
convcu
,
gemmtuner
,
convtuner
,
convops
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
ExternalAllocator
(),
ExternalSpconvMatmul
(),
SimpleExternalSpconvMatmul
(),
]
pccm
.
builder
.
build_pybind
(
cus
,
PACKAGE_ROOT
/
"core_cc"
,
PACKAGE_ROOT
/
"core_cc"
,
namespace_root
=
PACKAGE_ROOT
,
namespace_root
=
PACKAGE_ROOT
,
load_library
=
False
)
load_library
=
False
,
verbose
=
True
)
# cus_dev: List[pccm.Class] = [
# ]
# pccm.builder.build_pybind(cus_dev,
# PACKAGE_ROOT / "core_cc_dev",
# namespace_root=PACKAGE_ROOT,
# load_library=False,
# verbose=True)
spconv/constants.py
View file @
899008fa
...
@@ -30,6 +30,7 @@ if _filter_hwio_env is not None:
...
@@ -30,6 +30,7 @@ if _filter_hwio_env is not None:
raise
NotImplementedError
(
"SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead."
)
raise
NotImplementedError
(
"SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead."
)
DISABLE_JIT
=
os
.
getenv
(
"SPCONV_DISABLE_JIT"
,
"0"
)
==
"1"
DISABLE_JIT
=
os
.
getenv
(
"SPCONV_DISABLE_JIT"
,
"0"
)
==
"1"
NDIM_DONT_CARE
=
3
NDIM_DONT_CARE
=
3
FILTER_HWIO
=
False
FILTER_HWIO
=
False
...
@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32,
...
@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32,
SPCONV_NVRTC_MODE
=
NVRTCMode
.
ConstantMemory
SPCONV_NVRTC_MODE
=
NVRTCMode
.
ConstantMemory
SPCONV_DEBUG_NVRTC_KERNELS
=
False
SPCONV_DEBUG_NVRTC_KERNELS
=
False
SPCONV_DEBUG_CPP_ONLY
=
project_is_editable
(
PACKAGE_NAME
)
class
Spconv
Alloc
ator
Keys
:
class
AllocKeys
:
Pair
=
"Pair"
Pair
=
"Pair"
IndiceNumPerLoc
=
"IndiceNumPerLoc"
IndiceNumPerLoc
=
"IndiceNumPerLoc"
PairMask
=
"PairMask"
PairMask
=
"PairMask"
...
@@ -72,5 +75,31 @@ class SpconvAllocatorKeys:
...
@@ -72,5 +75,31 @@ class SpconvAllocatorKeys:
# MaskArgSortFwd = "MaskArgSortFwd"
# MaskArgSortFwd = "MaskArgSortFwd"
MaskArgSortBwd
=
"MaskArgSortBwd"
MaskArgSortBwd
=
"MaskArgSortBwd"
MaskOutputFwd
=
"MaskOutputFwd"
OutFeatures
=
"OutFeatures"
OutFeatures
=
"OutFeatures"
Features
=
"Features"
Filters
=
"Filters"
OutBp
=
"OutBp"
DIn
=
"DIn"
DFilters
=
"DFilters"
InpBuffer
=
"InpBuffer"
OutBuffer
=
"OutBuffer"
IndicePairsUniq
=
"IndicePairsUniq"
IndicePairsUniqBackup
=
"IndicePairsUniqBackup"
HashKOrKV
=
"HashKOrKV"
HashV
=
"HashV"
ThrustTemp
=
"ThrustTemp"
SPCONV_DEBUG_WEIGHT
=
False
SPCONV_DEBUG_WEIGHT
=
False
SPCONV_CPP_INDICE_PAIRS
=
True
SPCONV_CPP_INDICE_PAIRS_IGEMM
=
True
SPCONV_CPP_GEMM
=
True
\ No newline at end of file
spconv/core.py
View file @
899008fa
...
@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo
...
@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo
from
cumm.gemm
import
kernel
from
cumm.gemm
import
kernel
from
typing
import
List
from
typing
import
List
from
cumm.gemm.algospec.core
import
TensorOp
from
cumm.gemm.algospec.core
import
TensorOp
from
cumm.conv.main
import
gen_gemm_params
as
gen_conv_params
,
ConvFwdAndBwdInput
,
ConvBwdWeight
,
ConvIterAlgo
,
GemmAlgo
from
cumm.conv.main
import
gen_gemm_params
as
gen_conv_params
,
ConvFwdAndBwdInput
,
ConvBwdWeight
,
ConvFwd
,
ConvIterAlgo
,
GemmAlgo
from
cumm.conv.bases
import
(
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvLayout
,
from
cumm.conv.bases
import
(
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvLayout
,
ConvLayoutType
,
ConvMode
,
ConvOpType
)
ConvLayoutType
,
ConvMode
,
ConvOpType
)
from
spconv.algocore
import
get_gemm_algo_desp_from_param
from
spconv.constants
import
NDIM_DONT_CARE
from
spconv.constants
import
NDIM_DONT_CARE
...
@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [
...
@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [
increment_k_first
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
),
access_per_vector
=
1
),
]
]
IMPLGEMM_SIMT_PARAMS
=
[
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
32
,
16
),
(
32
,
32
,
8
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f32,f32,f32,f32,f32"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Simt
,
None
,
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
),
*
gen_conv_params
(
ConvBwdWeight
,
(
64
,
32
,
16
),
(
32
,
32
,
8
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f32,f32,f32,f32,f32"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Simt
,
None
,
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
),
]
IMPLGEMM_VOLTA_PARAMS
=
[
IMPLGEMM_VOLTA_PARAMS
=
[
...
@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [
...
@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [
# NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),
# NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),
# gen_conv_params(ConvFwdAndBwdInput, )
# gen_conv_params(ConvFwdAndBwdInput, )
# all int8 kernels use nvrtc.
*
gen_conv_params
(
ConvFwd
,
(
32
,
32
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
32
,
64
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
32
,
32
,
64
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
32
,
64
,
64
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
128
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
64
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
64
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
32
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
128
,
128
,
64
),
(
64
,
64
,
64
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
128
,
64
),
(
32
,
64
,
64
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
# *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
]
]
ALL_NATIVE_PARAMS
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_TURING_PARAMS
+
SHUFFLE_VOLTA_PARAMS
ALL_NATIVE_PARAMS
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_TURING_PARAMS
+
SHUFFLE_VOLTA_PARAMS
...
...
spconv/core_cc/csrc/sparse/all/__init__.pyi
View file @
899008fa
...
@@ -48,7 +48,7 @@ class SpconvOps:
...
@@ -48,7 +48,7 @@ class SpconvOps:
"""
"""
...
...
@staticmethod
@staticmethod
def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int:
def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor,
indice_num_per_loc: Tensor,
num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0
, use_bound_algo: bool = False
) -> int:
"""
"""
Args:
Args:
indices:
indices:
...
@@ -58,6 +58,7 @@ class SpconvOps:
...
@@ -58,6 +58,7 @@ class SpconvOps:
indice_pairs_uniq:
indice_pairs_uniq:
indice_pairs_uniq_before_sort:
indice_pairs_uniq_before_sort:
out_inds:
out_inds:
indice_num_per_loc:
num_out_act:
num_out_act:
batch_size:
batch_size:
output_dims:
output_dims:
...
@@ -68,6 +69,7 @@ class SpconvOps:
...
@@ -68,6 +69,7 @@ class SpconvOps:
dilation:
dilation:
transposed:
transposed:
stream_int:
stream_int:
use_bound_algo:
"""
"""
...
...
@staticmethod
@staticmethod
...
@@ -191,6 +193,31 @@ class SpconvOps:
...
@@ -191,6 +193,31 @@ class SpconvOps:
"""
"""
...
...
@staticmethod
@staticmethod
def indice_maxpool(out_features: Tensor, features: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, stream: int = 0) -> None:
"""
Args:
out_features:
features:
indice_pairs:
indice_pair_num:
num_activate_out:
stream:
"""
...
@staticmethod
def indice_maxpool_backward(din: Tensor, features: Tensor, out_features: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, stream: int = 0) -> None:
"""
Args:
din:
features:
out_features:
out_bp:
indice_pairs:
indice_pair_num:
stream:
"""
...
@staticmethod
def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None:
def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None:
"""
"""
Args:
Args:
...
@@ -369,7 +396,18 @@ class SpconvOps:
...
@@ -369,7 +396,18 @@ class SpconvOps:
@staticmethod
@staticmethod
def get_int32_max() -> int: ...
def get_int32_max() -> int: ...
@staticmethod
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0) -> Tensor:
def get_indice_gen_workspace_size(kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> int:
"""
Args:
kv:
num_act_in:
num_act_out_bound:
subm:
use_int64_hash_k:
"""
...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]:
"""
"""
Args:
Args:
allocator:
allocator:
...
@@ -386,10 +424,11 @@ class SpconvOps:
...
@@ -386,10 +424,11 @@ class SpconvOps:
transposed:
transposed:
is_train:
is_train:
stream_int:
stream_int:
num_out_act_bound:
"""
"""
...
...
@staticmethod
@staticmethod
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0) ->
None
:
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0
, num_out_act_bound: int = -1
) ->
int
:
"""
"""
Args:
Args:
allocator:
allocator:
...
@@ -405,12 +444,6 @@ class SpconvOps:
...
@@ -405,12 +444,6 @@ class SpconvOps:
subm:
subm:
transposed:
transposed:
stream_int:
stream_int:
"""
num_out_act_bound:
...
@staticmethod
def test_allocator(allocator) -> None:
"""
Args:
allocator:
"""
"""
...
...
spconv/core_cc/csrc/sparse/alloc.pyi
View file @
899008fa
...
@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
...
@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import Tensor
class ExternalAllocator:
class ExternalAllocator:
def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor:
def zeros(self, name: str, shape: List[int], dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
"""
Args:
Args:
name:
name:
shape:
shape:
dtype:
dtype:
device:
device:
is_temp_memory:
stream:
"""
"""
...
...
def empty(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor:
def empty(self, name: str, shape: List[int], dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
"""
Args:
Args:
name:
name:
shape:
shape:
dtype:
dtype:
device:
device:
is_temp_memory:
stream:
"""
"""
...
...
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> Tensor:
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
"""
Args:
Args:
name:
name:
...
@@ -28,9 +32,11 @@ class ExternalAllocator:
...
@@ -28,9 +32,11 @@ class ExternalAllocator:
value:
value:
dtype:
dtype:
device:
device:
is_temp_memory:
stream:
"""
"""
...
...
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> Tensor:
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
"""
Args:
Args:
name:
name:
...
@@ -38,6 +44,14 @@ class ExternalAllocator:
...
@@ -38,6 +44,14 @@ class ExternalAllocator:
value:
value:
dtype:
dtype:
device:
device:
is_temp_memory:
stream:
"""
...
def get_tensor_by_name(self, name: str) -> Tensor:
"""
Args:
name:
"""
"""
...
...
def free(self, ten: Tensor) -> None:
def free(self, ten: Tensor) -> None:
...
...
spconv/core_cc/csrc/sparse/convops/__init__.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from ...csrc.sparse.convops import ExternalSpconvMatmul
class GemmTuneResult:
algo_desp: GemmAlgoDesp
arch: Tuple[int, int]
splitk: int
def is_valid(self) -> bool: ...
@overload
def __init__(self) -> None: ...
@overload
def __init__(self, algo_desp: GemmAlgoDesp, arch: Tuple[int, int], splitk: int) -> None:
"""
Args:
algo_desp:
arch:
splitk:
"""
...
class ConvTuneResult:
algo_desp: ConvAlgoDesp
arch: Tuple[int, int]
splitk: int
@overload
def __init__(self) -> None: ...
@overload
def __init__(self, algo_desp: ConvAlgoDesp, arch: Tuple[int, int], splitk: int) -> None:
"""
Args:
algo_desp:
arch:
splitk:
"""
...
def is_valid(self) -> bool: ...
class ExternalSpconvMatmul:
def indice_conv_init_gemm(self, features_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, out_channel: int, stream_int: int = 0) -> Tensor:
"""
Args:
features_n:
filters_n:
all_weight_is_krsc:
is_kc_not_ck:
kv_center:
out_channel:
stream_int:
"""
...
def indice_conv_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None:
"""
Args:
inp_buffer_n:
out_buffer_n:
filters_n:
all_weight_is_krsc:
is_kc_not_ck:
nhot:
index:
"""
...
def indice_conv_bwd_init_gemm(self, features_n: str, filters_n: str, out_bp_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, stream_int: int = 0) -> Tensor:
"""
Args:
features_n:
filters_n:
out_bp_n:
dfilters_n:
all_weight_is_krsc:
is_kc_not_ck:
kv_center:
stream_int:
"""
...
def indice_conv_bwd_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None:
"""
Args:
inp_buffer_n:
out_buffer_n:
filters_n:
dfilters_n:
all_weight_is_krsc:
is_kc_not_ck:
nhot:
index:
"""
...
class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
def __init__(self, alloc) -> None:
"""
Args:
alloc:
"""
...
spconv/core_cc/csrc/sparse/convops/convops.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from cumm.tensorview import CUDAKernelTimer
class ConvTunerSimple:
def __init__(self, desps: List[ConvAlgoDesp]) -> None:
"""
Args:
desps:
"""
...
@staticmethod
def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
"""
Args:
arch:
"""
...
def get_all_available(self, inp: Tensor, weight: Tensor, out: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], op_type: int, mask_width: int, auto_fp32_accum: bool, fp32_accum: bool) -> List[ConvAlgoDesp]:
"""
Args:
inp:
weight:
out:
layout_i:
layout_w:
layout_o:
interleave_i:
interleave_w:
interleave_o:
arch:
op_type:
mask_width:
auto_fp32_accum:
fp32_accum:
"""
...
def cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
"""
Args:
desp:
arch:
stream_int:
"""
...
def tune_and_cache(self, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], mask: Tensor, mask_argsort: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, mask_output: Tensor = Tensor(), alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, auto_fp32_accum: bool = True, fp32_accum: bool = False, num_run: int = 5) -> Tuple[ConvTuneResult, float]:
"""
Args:
op_type:
inp:
weight:
output:
layout_i:
layout_w:
layout_o:
interleave_i:
interleave_w:
interleave_o:
arch:
mask:
mask_argsort:
indices:
reverse_mask:
mask_filter:
mask_width:
mask_output:
alpha:
beta:
stream_int:
auto_fp32_accum:
fp32_accum:
num_run:
"""
...
def get_tuned_algo(self, op_type: int, i_dtype: int, w_dtype: int, o_dtype: int, k: int, c: int, arch: Tuple[int, int], mask_width: int = -1) -> Tuple[Any, bool]:
"""
Args:
op_type:
i_dtype:
w_dtype:
o_dtype:
k:
c:
arch:
mask_width:
"""
...
def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False) -> None:
"""
Args:
profile_res:
op_type:
inp:
weight:
output:
mask:
mask_argsort:
mask_output:
indices:
reverse_mask:
mask_filter:
mask_width:
alpha:
beta:
stream_int:
workspace:
verbose:
timer:
force_nvrtc:
"""
...
def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int:
"""
Args:
desp:
splitk:
op_type:
N:
C:
K:
kv:
"""
...
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from cumm.tensorview import CUDAKernelTimer
class GemmTunerSimple:
def __init__(self, desps: List[GemmAlgoDesp]) -> None:
"""
Args:
desps:
"""
...
@staticmethod
def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
"""
Args:
arch:
"""
...
def get_all_available(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int) -> List[GemmAlgoDesp]:
"""
Args:
a:
b:
c:
trans_a:
trans_b:
trans_c:
arch:
shuffle_type:
"""
...
def cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
"""
Args:
desp:
arch:
stream_int:
"""
...
def tune_and_cache(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, num_run: int = 5) -> Tuple[GemmTuneResult, float]:
"""
Args:
a:
b:
c:
trans_a:
trans_b:
trans_c:
arch:
shuffle_type:
a_inds:
b_inds:
c_inds:
hint:
alpha:
beta:
stream_int:
num_run:
"""
...
def get_tuned_algo(self, a_dtype: int, b_dtype: int, c_dtype: int, a_shape: List[int], b_shape: List[int], c_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds_shape: List[int], b_inds_shape: List[int], c_inds_shape: List[int], hint: int = 0) -> Tuple[Any, bool]:
"""
Args:
a_dtype:
b_dtype:
c_dtype:
a_shape:
b_shape:
c_shape:
trans_a:
trans_b:
trans_c:
arch:
shuffle_type:
a_inds_shape:
b_inds_shape:
c_inds_shape:
hint:
"""
...
def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False) -> None:
"""
Args:
profile_res:
a:
b:
c:
trans_a:
trans_b:
trans_c:
arch:
stream_int:
shuffle_type:
a_inds:
b_inds:
c_inds:
hint:
alpha:
beta:
workspace:
timer:
force_nvrtc:
"""
...
spconv/core_cc/csrc/sparse/convops/spops.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class ConvGemmOps:
@staticmethod
def get_compute_capability(index: int = -1) -> Tuple[int, int]:
"""
Args:
index:
"""
...
@staticmethod
def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None:
"""
1. this function need to take a out features
that from subm first mm.
2. this function don't support CPU.
Args:
allocator:
ext_mm:
gemm_tuner:
all_w_is_krsc:
filter_hwio:
features:
filters:
indice_pairs:
indice_pair_num:
num_activate_out:
inverse:
subm:
algo:
stream_int:
"""
...
@staticmethod
def indice_conv_backward(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None:
"""
Args:
allocator:
ext_mm:
gemm_tuner:
all_w_is_krsc:
filter_hwio:
features:
filters:
out_bp:
indice_pairs:
indice_pair_num:
inverse:
subm:
algo:
stream_int:
"""
...
@staticmethod
def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> int:
"""
Args:
allocator:
conv_tuner:
features:
filters:
pair_fwd:
pair_mask_fwd_splits:
mask_argsort_fwd_splits:
num_activate_out:
masks:
is_train:
is_subm:
stream_int:
timer:
auto_fp32_accum:
fp32_accum:
"""
...
@staticmethod
def implicit_gemm_backward(allocator, conv_tuner, features: Tensor, filters: Tensor, out_bp: Tensor, pair_fwd: Tensor, pair_bwd: Tensor, pair_mask_fwd_splits: List[Tensor], pair_mask_bwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], mask_argsort_bwd_splits: List[Tensor], mask_output_fwd: Tensor, masks: Tensor, mask_width: int, is_subm: bool, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> None:
"""
Args:
allocator:
conv_tuner:
features:
filters:
out_bp:
pair_fwd:
pair_bwd:
pair_mask_fwd_splits:
pair_mask_bwd_splits:
mask_argsort_fwd_splits:
mask_argsort_bwd_splits:
mask_output_fwd:
masks:
mask_width:
is_subm:
stream_int:
timer:
auto_fp32_accum:
fp32_accum:
"""
...
spconv/core_cc/cumm/common.pyi
View file @
899008fa
...
@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue
...
@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue
class CompileInfo:
class CompileInfo:
@staticmethod
@staticmethod
def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
@staticmethod
def arch_is_compiled(arch: Tuple[int, int]) -> bool:
"""
Args:
arch:
"""
...
spconv/core_cc/cumm/gemm/main.pyi
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import GemmParams
from cumm.tensorview.gemm import GemmParams
class GemmMainUnitTest:
class GemmMainUnitTest:
@staticmethod
@staticmethod
def get_all_algo_desp() -> List[
Any
]: ...
def get_all_algo_desp() -> List[
GemmAlgoDesp
]: ...
@staticmethod
@staticmethod
def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type:
str = "0"
, a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type:
int = 0
, a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
"""
"""
Args:
Args:
a_shape:
a_shape:
...
...
spconv/csrc/sparse/all.py
View file @
899008fa
This diff is collapsed.
Click to expand it.
spconv/csrc/sparse/alloc.py
View file @
899008fa
import
pccm
import
pccm
from
cumm.common
import
TensorView
,
TensorViewCPU
,
TensorViewKernel
,
ThrustLib
from
cumm.common
import
TensorView
,
TensorViewCPU
,
TensorViewKernel
,
ThrustLib
from
spconv.constants
import
AllocKeys
class
ExternalAllocatorGuard
(
pccm
.
Class
):
class
ExternalAllocatorGuard
(
pccm
.
Class
):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
...
@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class):
...
@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class):
...
@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class):
...
@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class):
...
@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"float"
)
code
.
arg
(
"value"
,
"float"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
@
pccm
.
member_function
(
virtual
=
True
,
pure_virtual
=
True
)
def
get_tensor_by_name
(
self
):
code
=
pccm
.
code
()
code
.
arg
(
"name"
,
"std::string"
)
return
code
.
ret
(
"tv::Tensor"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class):
...
@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
code
.
raw
(
f
"""
// "" means temp memory
// "" means temp memory
auto ten = zeros(
""
, shape, dtype, device);
auto ten = zeros(
name
, shape, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
this->free(ten);
}});
}});
...
@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class):
...
@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
code
.
raw
(
f
"""
auto ten = empty(
""
, shape, dtype, device);
auto ten = empty(
name
, shape, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
this->free(ten);
}});
}});
...
@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class):
...
@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
code
.
raw
(
f
"""
auto ten = full_int(
""
, shape, value, dtype, device);
auto ten = full_int(
name
, shape, value, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
this->free(ten);
}});
}});
...
@@ -150,8 +176,10 @@ class ExternalAllocator(pccm.Class):
...
@@ -150,8 +176,10 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
code
.
raw
(
f
"""
auto ten = full_float(
""
, shape, value, dtype, device);
auto ten = full_float(
name
, shape, value, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
this->free(t);
this->free(t);
}});
}});
...
@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class):
...
@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class):
code
.
arg
(
"num_bytes"
,
"std::ptrdiff_t"
)
code
.
arg
(
"num_bytes"
,
"std::ptrdiff_t"
)
code
.
ret
(
"char*"
)
code
.
ret
(
"char*"
)
code
.
raw
(
f
"""
code
.
raw
(
f
"""
auto ten = allocator_.empty(
""
, {{num_bytes}}, tv::uint8, 0);
auto ten = allocator_.empty(
{
pccm
.
literal
(
AllocKeys
.
ThrustTemp
)
}
, {{num_bytes}}, tv::uint8, 0);
return reinterpret_cast<char*>(ten.raw_data());
return reinterpret_cast<char*>(ten.raw_data());
"""
)
"""
)
return
code
return
code
...
@@ -193,3 +221,158 @@ class ThrustAllocator(pccm.Class):
...
@@ -193,3 +221,158 @@ class ThrustAllocator(pccm.Class):
return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
"""
)
"""
)
return
code
return
code
class StaticAllocator(ExternalAllocator):
    """Allocator backed by a fixed, caller-supplied dictionary of tensors.

    Described by the original author as "a simple allocator for tensorrt
    plugin". Every named request is served by looking the name up in
    ``tensor_dict_`` and returning a view (``tv::from_blob``) over the
    pre-allocated storage; nothing is allocated for named tensors, and
    ``free`` / ``free_noexcept`` are consequently no-ops.

    The only exception is the thrust scratch buffer (``AllocKeys.ThrustTemp``),
    which is not expected to be in the dictionary: it is allocated lazily
    with ``tv::empty`` and cached in ``thrust_tmp_tensor_``, growing by a
    factor of ``self.grow`` whenever a larger request arrives.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        # name -> pre-allocated tensor supplied at construction time.
        self.add_member("tensor_dict_",
                        "std::unordered_map<std::string, tv::Tensor>")
        # human-readable dump of tensor_dict_, used in error messages.
        self.add_member("repr_", "std::string")
        # cached scratch buffer handed out for AllocKeys.ThrustTemp.
        self.add_member("thrust_tmp_tensor_", "tv::Tensor")
        # growth factor applied when the thrust scratch buffer is too small.
        self.grow = 1.5

    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
        """Store ``tensor_dict`` and pre-render its description into ``repr_``."""
        code = pccm.code()
        code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
        code.ctor_init("tensor_dict_", "tensor_dict")
        code.raw(f"""
        std::stringstream ss;
        for (auto& p : tensor_dict){{
            tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
        }}
        repr_ = ss.str();
        """)
        return code

    @pccm.member_function(virtual=True)
    def _get_raw_and_check(self):
        """Return a view of the named pre-allocated tensor with the requested shape.

        The generated code asserts that the stored tensor is large enough and
        lives on the requested device before wrapping its storage.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        # The accumulate init value is int64_t(1) on purpose: std::accumulate
        # accumulates in the type of the init argument, so a plain `1` would
        # compute the element count in `int` and overflow for large shapes.
        #
        # NOTE(review): the size check compares nbytes() against
        # total * tv::bit_size(dtype). If bit_size returns *bits* (as the name
        # suggests) this check is 8x too strict and should divide by 8 -
        # confirm against the tensorview headers before changing it.
        code.raw("""
        auto res = get_tensor_by_name(name);
        size_t total = std::accumulate(shape.begin(), shape.end(), int64_t(1), std::multiplies<int64_t>());
        TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype))
            && res.device() == device, "alloc failed", shape, res.shape());
        return tv::from_blob(res.raw_data(), shape, dtype, device);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def zeros(self):
        """Return the named tensor viewed as ``shape`` and zero-filled on ``stream``."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw("""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.zero_(tvctx);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def empty(self):
        """Return the named tensor viewed as ``shape`` without initialising it.

        Requests named ``AllocKeys.ThrustTemp`` bypass the dictionary and are
        served from a single cached, grow-only scratch tensor.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
            // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
            // we assume each allocator always handle one stream
            // so we can just use one tensor
            tv::Tensor res = thrust_tmp_tensor_;
            if (res.empty()){{
                res = tv::empty(shape, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            if (shape[0] > thrust_tmp_tensor_.dim(0)){{
                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            return res;
        }}else{{
            auto blob = _get_raw_and_check(name, shape, dtype, device);
            return blob;
        }}
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_int(self):
        """Return the named tensor viewed as ``shape`` and filled with an int ``value``."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        # Bind the caller's stream to the context, as zeros() does, so the
        # fill is not issued on the context's default stream.
        code.raw("""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_float(self):
        """Return the named tensor viewed as ``shape`` and filled with a float ``value``."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        # The context must be declared here: the emitted C++ previously used
        # `tvctx` without defining it, which cannot compile.
        code.raw("""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def get_tensor_by_name(self):
        """Return the pre-allocated tensor registered under ``name``.

        The generated code raises (via ``TV_ASSERT_RT_ERR``) with the list of
        known tensors when ``name`` is missing.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.raw(f"""
        TV_ASSERT_RT_ERR(tensor_dict_.find(name) != tensor_dict_.end(), "can't find", name, "exists:\\n", repr_);
        return tensor_dict_.at(name);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free(self):
        """No-op: this allocator emits no deallocation code for ``ten``."""
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free_noexcept(self):
        """No-op counterpart of ``free``; emits an empty body."""
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code
spconv/csrc/sparse/convops.py
View file @
899008fa
This diff is collapsed.
Click to expand it.
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment