Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
899008fa
Commit
899008fa
authored
Jul 20, 2022
by
yan.yan
Browse files
working on c++ only
parent
f78575ea
Changes
31
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3404 additions
and
327 deletions
+3404
-327
docs/DEVELOPMENT.md
docs/DEVELOPMENT.md
+0
-25
setup.py
setup.py
+22
-3
spconv/algo.py
spconv/algo.py
+187
-67
spconv/algocore.py
spconv/algocore.py
+13
-3
spconv/benchmark/me.py
spconv/benchmark/me.py
+0
-24
spconv/benchmark/thsp.py
spconv/benchmark/thsp.py
+0
-24
spconv/build.py
spconv/build.py
+35
-3
spconv/constants.py
spconv/constants.py
+30
-1
spconv/core.py
spconv/core.py
+177
-27
spconv/core_cc/csrc/sparse/all/__init__.pyi
spconv/core_cc/csrc/sparse/all/__init__.pyi
+43
-10
spconv/core_cc/csrc/sparse/alloc.pyi
spconv/core_cc/csrc/sparse/alloc.pyi
+18
-4
spconv/core_cc/csrc/sparse/convops/__init__.pyi
spconv/core_cc/csrc/sparse/convops/__init__.pyi
+96
-0
spconv/core_cc/csrc/sparse/convops/convops.pyi
spconv/core_cc/csrc/sparse/convops/convops.pyi
+126
-0
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
+107
-0
spconv/core_cc/csrc/sparse/convops/spops.pyi
spconv/core_cc/csrc/sparse/convops/spops.pyi
+101
-0
spconv/core_cc/cumm/common.pyi
spconv/core_cc/cumm/common.pyi
+7
-0
spconv/core_cc/cumm/gemm/main.pyi
spconv/core_cc/cumm/gemm/main.pyi
+3
-2
spconv/csrc/sparse/all.py
spconv/csrc/sparse/all.py
+217
-79
spconv/csrc/sparse/alloc.py
spconv/csrc/sparse/alloc.py
+190
-7
spconv/csrc/sparse/convops.py
spconv/csrc/sparse/convops.py
+2032
-48
No files found.
docs/DEVELOPMENT.md
deleted
100644 → 0
View file @
f78575ea
<!--
Copyright 2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# How to develop spconv 2.x
## First step
spconv 2.x is written in a unique C++ framework, ```pccm```. Read the [pccm guide]() to learn how to use ```pccm```.
It's recommended to uninstall the spconv and cumm packages installed by pip, then install both spconv and cumm in editable mode (```pip install -e .```).
## Architecture
\ No newline at end of file
setup.py
View file @
899008fa
...
...
@@ -159,6 +159,9 @@ if disable_jit is not None and disable_jit == "1":
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.hash.core
import
HashTable
from
cumm.common
import
CompileInfo
from
spconv.csrc.sparse.alloc
import
ExternalAllocator
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
convcu
=
ConvMainUnitTest
(
IMPLGEMM_SIMT_PARAMS
+
IMPLGEMM_VOLTA_PARAMS
+
IMPLGEMM_TURING_PARAMS
)
...
...
@@ -172,14 +175,30 @@ if disable_jit is not None and disable_jit == "1":
std
=
"c++14"
else
:
std
=
"c++17"
cus
=
[
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
()]
if
CUMM_CPU_ONLY_BUILD
:
cus
=
[
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
()]
gemmtuner
=
GemmTunerSimple
(
cu
)
gemmtuner
.
namespace
=
"csrc.sparse.convops.gemmops"
convtuner
=
ConvTunerSimple
(
convcu
)
convtuner
.
namespace
=
"csrc.sparse.convops.convops"
convops
=
ConvGemmOps
(
gemmtuner
,
convtuner
)
convops
.
namespace
=
"csrc.sparse.convops.spops"
else
:
gemmtuner
=
GemmTunerSimple
(
None
)
gemmtuner
.
namespace
=
"csrc.sparse.convops.gemmops"
convtuner
=
ConvTunerSimple
(
None
)
convtuner
.
namespace
=
"csrc.sparse.convops.convops"
convops
=
ConvGemmOps
(
gemmtuner
,
convtuner
)
convops
.
namespace
=
"csrc.sparse.convops.spops"
cus
=
[
gemmtuner
,
convtuner
,
convops
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
ExternalAllocator
(),
ExternalSpconvMatmul
()]
if
not
CUMM_CPU_ONLY_BUILD
:
cus
.
extend
([
cu
,
convcu
])
ext_modules
:
List
[
Extension
]
=
[
PCCMExtension
(
cus
,
"spconv/core_cc"
,
Path
(
__file__
).
resolve
().
parent
/
"spconv"
,
objects_folder
=
"objects"
,
std
=
std
,
disable_pch
=
True
,
verbose
=
True
)
...
...
spconv/algo.py
View file @
899008fa
This diff is collapsed.
Click to expand it.
spconv/algocore.py
View file @
899008fa
...
...
@@ -24,8 +24,8 @@ from cumm.tensorview.gemm import ConvLayoutType as ConvLayoutTypeCpp
from
cumm.tensorview.gemm
import
ShuffleStrideType
as
ShuffleStrideTypeCpp
from
cumm.tensorview.gemm
import
ConvParams
,
GemmAlgoDesp
,
GemmParams
from
cumm.gemm.main
import
GemmAlgoParams
from
cumm.conv.main
import
ConvAlgoParams
,
ConvIterAlgo
from
cumm.gemm.main
import
GemmAlgoParams
,
gen_gemm_kernels
from
cumm.conv.main
import
ConvAlgoParams
,
ConvIterAlgo
,
gen_gemm_kernels
as
gen_conv_kernels
from
cumm
import
dtypes
from
cumm.conv.bases
import
(
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvLayout
,
ConvLayoutType
,
ConvMode
,
ConvOpType
)
...
...
@@ -56,10 +56,15 @@ def _assign_gemm_desp_props(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
desp
.
access_per_vector
=
p
.
access_per_vector
desp
.
is_nvrtc
=
p
.
is_nvrtc
def
get_gemm_algo_desp_from_param
(
p
:
GemmAlgoParams
):
desp
=
GemmAlgoDesp
()
_assign_gemm_desp_props
(
desp
,
p
)
# here we must generate kernel for element-per-access data
ker
=
gen_gemm_kernels
(
p
)
desp
.
element_per_access_a
=
ker
.
input_spec
.
input_iter_a
.
element_per_acc
desp
.
element_per_access_b
=
ker
.
input_spec
.
input_iter_b
.
element_per_acc
desp
.
element_per_access_c
=
ker
.
output_spec
.
out_iter
.
element_per_acc
return
desp
...
...
@@ -78,6 +83,10 @@ def get_conv_algo_desp_from_param(p: ConvAlgoParams):
desp
.
interleave_o
=
p
.
layout_desp_output
.
interleave
desp
.
mask_sparse
=
p
.
mask_sparse
desp
.
increment_k_first
=
p
.
increment_k_first
ker
=
gen_conv_kernels
(
p
)
desp
.
element_per_access_a
=
ker
.
input_spec
.
input_iter_a
.
element_per_acc
desp
.
element_per_access_b
=
ker
.
input_spec
.
input_iter_b
.
element_per_acc
desp
.
element_per_access_c
=
ker
.
output_spec
.
out_iter
.
element_per_acc
return
desp
...
...
@@ -106,6 +115,7 @@ def _assign_gemm_params(desp: Union[ConvAlgoDesp, GemmAlgoDesp],
p
.
is_nvrtc
=
desp
.
is_nvrtc
def
get_gemm_param_from_desp
(
desp
:
GemmAlgoDesp
):
p
=
GemmAlgoParams
((
0
,
0
,
0
),
(
0
,
0
,
0
),
0
,
"s8,s8,s8,s8,s8"
,
False
,
False
,
False
,
GemmAlgo
.
Simt
)
...
...
spconv/benchmark/me.py
deleted
100644 → 0
View file @
f78575ea
"""Benchmark MinkowskiEngine
"""
from
spconv.benchmark.core
import
get_voxel_data
import
time
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
from
spconv.core
import
ConvAlgo
from
cumm
import
dtypes
from
spconv.test_utils
import
params_grid
_DTYPE_TO_TORCH_DTYPE
=
{
dtypes
.
float32
:
torch
.
float32
,
dtypes
.
float16
:
torch
.
float16
,
}
def
bench_me_basic
(
dtype_str
:
str
):
dtype
=
dtypes
.
get_dtype_by_shortcut
(
dtype_str
)
if
dtype
not
in
_DTYPE_TO_TORCH_DTYPE
:
raise
NotImplementedError
(
"only support bench f32 and f16 for now"
)
torch_dtype
=
_DTYPE_TO_TORCH_DTYPE
[
dtype
]
spconv/benchmark/thsp.py
deleted
100644 → 0
View file @
f78575ea
"""Benchmark torchsparse
"""
from
spconv.benchmark.core
import
get_voxel_data
import
time
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
from
spconv.core
import
ConvAlgo
from
cumm
import
dtypes
from
spconv.test_utils
import
params_grid
_DTYPE_TO_TORCH_DTYPE
=
{
dtypes
.
float32
:
torch
.
float32
,
dtypes
.
float16
:
torch
.
float16
,
}
def
bench_torchsparse_basic
(
dtype_str
:
str
):
dtype
=
dtypes
.
get_dtype_by_shortcut
(
dtype_str
)
if
dtype
not
in
_DTYPE_TO_TORCH_DTYPE
:
raise
NotImplementedError
(
"only support bench f32 and f16 for now"
)
torch_dtype
=
_DTYPE_TO_TORCH_DTYPE
[
dtype
]
spconv/build.py
View file @
899008fa
...
...
@@ -13,6 +13,7 @@
# limitations under the License.
from
pathlib
import
Path
from
typing
import
List
import
pccm
from
pccm.utils
import
project_is_editable
,
project_is_installed
...
...
@@ -32,17 +33,48 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from
spconv.csrc.sparse.alloc
import
ExternalAllocator
from
spconv.csrc.utils
import
BoxOps
from
spconv.csrc.hash.core
import
HashTable
from
spconv.csrc.sparse.convops
import
GemmTunerSimple
,
ExternalSpconvMatmul
from
spconv.csrc.sparse.convops
import
ConvTunerSimple
,
ConvGemmOps
from
spconv.csrc.sparse.convops
import
SimpleExternalSpconvMatmul
all_shuffle
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
all_shuffle
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_shuffle
))
cu
=
GemmMainUnitTest
(
all_shuffle
)
cu
.
namespace
=
"cumm.gemm.main"
all_imp
=
(
IMPLGEMM_SIMT_PARAMS
+
IMPLGEMM_VOLTA_PARAMS
+
IMPLGEMM_TURING_PARAMS
)
IMPLGEMM_TURING_PARAMS
)
all_imp
=
list
(
filter
(
lambda
x
:
not
x
.
is_nvrtc
,
all_imp
))
convcu
=
ConvMainUnitTest
(
all_imp
)
convcu
.
namespace
=
"cumm.conv.main"
pccm
.
builder
.
build_pybind
([
cu
,
convcu
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
ExternalAllocator
()],
gemmtuner
=
GemmTunerSimple
(
cu
)
gemmtuner
.
namespace
=
"csrc.sparse.convops.gemmops"
convtuner
=
ConvTunerSimple
(
convcu
)
convtuner
.
namespace
=
"csrc.sparse.convops.convops"
convops
=
ConvGemmOps
(
gemmtuner
,
convtuner
)
convops
.
namespace
=
"csrc.sparse.convops.spops"
cus
=
[
cu
,
convcu
,
gemmtuner
,
convtuner
,
convops
,
SpconvOps
(),
BoxOps
(),
HashTable
(),
CompileInfo
(),
ExternalAllocator
(),
ExternalSpconvMatmul
(),
SimpleExternalSpconvMatmul
(),
]
pccm
.
builder
.
build_pybind
(
cus
,
PACKAGE_ROOT
/
"core_cc"
,
namespace_root
=
PACKAGE_ROOT
,
load_library
=
False
)
load_library
=
False
,
verbose
=
True
)
# cus_dev: List[pccm.Class] = [
# ]
# pccm.builder.build_pybind(cus_dev,
# PACKAGE_ROOT / "core_cc_dev",
# namespace_root=PACKAGE_ROOT,
# load_library=False,
# verbose=True)
spconv/constants.py
View file @
899008fa
...
...
@@ -30,6 +30,7 @@ if _filter_hwio_env is not None:
raise
NotImplementedError
(
"SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead."
)
DISABLE_JIT
=
os
.
getenv
(
"SPCONV_DISABLE_JIT"
,
"0"
)
==
"1"
NDIM_DONT_CARE
=
3
FILTER_HWIO
=
False
...
...
@@ -59,8 +60,10 @@ SPCONV_BWD_SPLITK = list(map(int, os.getenv("SPCONV_BWD_SPLITK", "1,2,4,8,16,32,
SPCONV_NVRTC_MODE
=
NVRTCMode
.
ConstantMemory
SPCONV_DEBUG_NVRTC_KERNELS
=
False
SPCONV_DEBUG_CPP_ONLY
=
project_is_editable
(
PACKAGE_NAME
)
class
Spconv
Alloc
ator
Keys
:
class
AllocKeys
:
Pair
=
"Pair"
IndiceNumPerLoc
=
"IndiceNumPerLoc"
PairMask
=
"PairMask"
...
...
@@ -72,5 +75,31 @@ class SpconvAllocatorKeys:
# MaskArgSortFwd = "MaskArgSortFwd"
MaskArgSortBwd
=
"MaskArgSortBwd"
MaskOutputFwd
=
"MaskOutputFwd"
OutFeatures
=
"OutFeatures"
Features
=
"Features"
Filters
=
"Filters"
OutBp
=
"OutBp"
DIn
=
"DIn"
DFilters
=
"DFilters"
InpBuffer
=
"InpBuffer"
OutBuffer
=
"OutBuffer"
IndicePairsUniq
=
"IndicePairsUniq"
IndicePairsUniqBackup
=
"IndicePairsUniqBackup"
HashKOrKV
=
"HashKOrKV"
HashV
=
"HashV"
ThrustTemp
=
"ThrustTemp"
SPCONV_DEBUG_WEIGHT
=
False
SPCONV_CPP_INDICE_PAIRS
=
True
SPCONV_CPP_INDICE_PAIRS_IGEMM
=
True
SPCONV_CPP_GEMM
=
True
\ No newline at end of file
spconv/core.py
View file @
899008fa
...
...
@@ -16,9 +16,10 @@ from cumm.gemm.main import gen_shuffle_params_v2 as gen_shuffle_params, GemmAlgo
from
cumm.gemm
import
kernel
from
typing
import
List
from
cumm.gemm.algospec.core
import
TensorOp
from
cumm.conv.main
import
gen_gemm_params
as
gen_conv_params
,
ConvFwdAndBwdInput
,
ConvBwdWeight
,
ConvIterAlgo
,
GemmAlgo
from
cumm.conv.main
import
gen_gemm_params
as
gen_conv_params
,
ConvFwdAndBwdInput
,
ConvBwdWeight
,
ConvFwd
,
ConvIterAlgo
,
GemmAlgo
from
cumm.conv.bases
import
(
NCHW
,
NHWC
,
ConvIterAlgo
,
ConvLayout
,
ConvLayoutType
,
ConvMode
,
ConvOpType
)
from
spconv.algocore
import
get_gemm_algo_desp_from_param
from
spconv.constants
import
NDIM_DONT_CARE
...
...
@@ -402,32 +403,6 @@ IMPLGEMM_SIMT_PARAMS = [
increment_k_first
=
True
,
access_per_vector
=
1
),
]
IMPLGEMM_SIMT_PARAMS
=
[
*
gen_conv_params
(
ConvFwdAndBwdInput
,
(
64
,
32
,
16
),
(
32
,
32
,
8
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f32,f32,f32,f32,f32"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Simt
,
None
,
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
),
*
gen_conv_params
(
ConvBwdWeight
,
(
64
,
32
,
16
),
(
32
,
32
,
8
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"f32,f32,f32,f32,f32"
,
"f16,f16,f16,f32,f32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Simt
,
None
,
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
),
]
IMPLGEMM_VOLTA_PARAMS
=
[
...
...
@@ -693,6 +668,181 @@ IMPLGEMM_TURING_PARAMS = [
# NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),
# gen_conv_params(ConvFwdAndBwdInput, )
# all int8 kernels use nvrtc.
*
gen_conv_params
(
ConvFwd
,
(
32
,
32
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
32
,
64
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
32
,
32
,
64
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
32
,
64
,
64
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
128
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
64
,
32
),
(
32
,
64
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
64
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
32
,
32
),
(
32
,
32
,
32
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
128
,
128
,
64
),
(
64
,
64
,
64
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
*
gen_conv_params
(
ConvFwd
,
(
64
,
128
,
64
),
(
32
,
64
,
64
),
NDIM_DONT_CARE
,
ConvIterAlgo
.
Optimized
,
2
,
[
"s8,s8,s8,s32,s32"
],
NHWC
,
NHWC
,
NHWC
,
GemmAlgo
.
Turing
,
TensorOp
((
8
,
8
,
16
)),
mask_sparse
=
True
,
increment_k_first
=
True
,
access_per_vector
=
1
,
is_nvrtc
=
False
),
# *gen_conv_params(ConvFwd, (32, 32, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 64, 32), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
# *gen_conv_params(ConvFwd, (32, 32, 64), (32, 32, 32),
# NDIM_DONT_CARE,
# ConvIterAlgo.Optimized,
# 2, ["s8,s8,s8,s32,s32"],
# NHWC,
# NHWC,
# NHWC,
# GemmAlgo.Turing,
# TensorOp((8, 8, 16)),
# mask_sparse=True,
# increment_k_first=True,
# access_per_vector=0,
# is_nvrtc=True),
]
ALL_NATIVE_PARAMS
=
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_TURING_PARAMS
+
SHUFFLE_VOLTA_PARAMS
...
...
spconv/core_cc/csrc/sparse/all/__init__.pyi
View file @
899008fa
...
...
@@ -48,7 +48,7 @@ class SpconvOps:
"""
...
@staticmethod
def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> int:
def generate_conv_inds_stage2(indices: Tensor, hashdata_k: Tensor, hashdata_v: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_pairs_uniq_before_sort: Tensor, out_inds: Tensor,
indice_num_per_loc: Tensor,
num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0
, use_bound_algo: bool = False
) -> int:
"""
Args:
indices:
...
...
@@ -58,6 +58,7 @@ class SpconvOps:
indice_pairs_uniq:
indice_pairs_uniq_before_sort:
out_inds:
indice_num_per_loc:
num_out_act:
batch_size:
output_dims:
...
...
@@ -68,6 +69,7 @@ class SpconvOps:
dilation:
transposed:
stream_int:
use_bound_algo:
"""
...
@staticmethod
...
...
@@ -191,6 +193,31 @@ class SpconvOps:
"""
...
@staticmethod
def indice_maxpool(out_features: Tensor, features: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, stream: int = 0) -> None:
"""
Args:
out_features:
features:
indice_pairs:
indice_pair_num:
num_activate_out:
stream:
"""
...
@staticmethod
def indice_maxpool_backward(din: Tensor, features: Tensor, out_features: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, stream: int = 0) -> None:
"""
Args:
din:
features:
out_features:
out_bp:
indice_pairs:
indice_pair_num:
stream:
"""
...
@staticmethod
def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None:
"""
Args:
...
...
@@ -369,7 +396,18 @@ class SpconvOps:
@staticmethod
def get_int32_max() -> int: ...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0) -> Tensor:
def get_indice_gen_workspace_size(kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> int:
"""
Args:
kv:
num_act_in:
num_act_out_bound:
subm:
use_int64_hash_k:
"""
...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]:
"""
Args:
allocator:
...
...
@@ -386,10 +424,11 @@ class SpconvOps:
transposed:
is_train:
stream_int:
num_out_act_bound:
"""
...
@staticmethod
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0) ->
None
:
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0
, num_out_act_bound: int = -1
) ->
int
:
"""
Args:
allocator:
...
...
@@ -405,12 +444,6 @@ class SpconvOps:
subm:
transposed:
stream_int:
"""
...
@staticmethod
def test_allocator(allocator) -> None:
"""
Args:
allocator:
num_out_act_bound:
"""
...
spconv/core_cc/csrc/sparse/alloc.pyi
View file @
899008fa
...
...
@@ -2,25 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class ExternalAllocator:
def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor:
def zeros(self, name: str, shape: List[int], dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
Args:
name:
shape:
dtype:
device:
is_temp_memory:
stream:
"""
...
def empty(self, name: str, shape: List[int], dtype: int, device: int) -> Tensor:
def empty(self, name: str, shape: List[int], dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
Args:
name:
shape:
dtype:
device:
is_temp_memory:
stream:
"""
...
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> Tensor:
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
Args:
name:
...
...
@@ -28,9 +32,11 @@ class ExternalAllocator:
value:
dtype:
device:
is_temp_memory:
stream:
"""
...
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> Tensor:
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int
, is_temp_memory: bool = False, stream: int = 0
) -> Tensor:
"""
Args:
name:
...
...
@@ -38,6 +44,14 @@ class ExternalAllocator:
value:
dtype:
device:
is_temp_memory:
stream:
"""
...
def get_tensor_by_name(self, name: str) -> Tensor:
"""
Args:
name:
"""
...
def free(self, ten: Tensor) -> None:
...
...
spconv/core_cc/csrc/sparse/convops/__init__.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from ...csrc.sparse.convops import ExternalSpconvMatmul
class GemmTuneResult:
algo_desp: GemmAlgoDesp
arch: Tuple[int, int]
splitk: int
def is_valid(self) -> bool: ...
@overload
def __init__(self) -> None: ...
@overload
def __init__(self, algo_desp: GemmAlgoDesp, arch: Tuple[int, int], splitk: int) -> None:
"""
Args:
algo_desp:
arch:
splitk:
"""
...
class ConvTuneResult:
algo_desp: ConvAlgoDesp
arch: Tuple[int, int]
splitk: int
@overload
def __init__(self) -> None: ...
@overload
def __init__(self, algo_desp: ConvAlgoDesp, arch: Tuple[int, int], splitk: int) -> None:
"""
Args:
algo_desp:
arch:
splitk:
"""
...
def is_valid(self) -> bool: ...
class ExternalSpconvMatmul:
def indice_conv_init_gemm(self, features_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, out_channel: int, stream_int: int = 0) -> Tensor:
"""
Args:
features_n:
filters_n:
all_weight_is_krsc:
is_kc_not_ck:
kv_center:
out_channel:
stream_int:
"""
...
def indice_conv_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None:
"""
Args:
inp_buffer_n:
out_buffer_n:
filters_n:
all_weight_is_krsc:
is_kc_not_ck:
nhot:
index:
"""
...
def indice_conv_bwd_init_gemm(self, features_n: str, filters_n: str, out_bp_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, kv_center: int, stream_int: int = 0) -> Tensor:
"""
Args:
features_n:
filters_n:
out_bp_n:
dfilters_n:
all_weight_is_krsc:
is_kc_not_ck:
kv_center:
stream_int:
"""
...
def indice_conv_bwd_cpu_gemm(self, inp_buffer_n: str, out_buffer_n: str, filters_n: str, dfilters_n: str, all_weight_is_krsc: bool, is_kc_not_ck: bool, nhot: int, index: int) -> None:
"""
Args:
inp_buffer_n:
out_buffer_n:
filters_n:
dfilters_n:
all_weight_is_krsc:
is_kc_not_ck:
nhot:
index:
"""
...
class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
def __init__(self, alloc) -> None:
"""
Args:
alloc:
"""
...
spconv/core_cc/csrc/sparse/convops/convops.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import ConvAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import ConvTuneResult
from cumm.tensorview import CUDAKernelTimer
class ConvTunerSimple:
def __init__(self, desps: List[ConvAlgoDesp]) -> None:
"""
Args:
desps:
"""
...
@staticmethod
def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
"""
Args:
arch:
"""
...
def get_all_available(self, inp: Tensor, weight: Tensor, out: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], op_type: int, mask_width: int, auto_fp32_accum: bool, fp32_accum: bool) -> List[ConvAlgoDesp]:
"""
Args:
inp:
weight:
out:
layout_i:
layout_w:
layout_o:
interleave_i:
interleave_w:
interleave_o:
arch:
op_type:
mask_width:
auto_fp32_accum:
fp32_accum:
"""
...
def cached_get_nvrtc_params(self, desp: ConvAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
"""
Args:
desp:
arch:
stream_int:
"""
...
def tune_and_cache(self, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, layout_i: int, layout_w: int, layout_o: int, interleave_i: int, interleave_w: int, interleave_o: int, arch: Tuple[int, int], mask: Tensor, mask_argsort: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, mask_output: Tensor = Tensor(), alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, auto_fp32_accum: bool = True, fp32_accum: bool = False, num_run: int = 5) -> Tuple[ConvTuneResult, float]:
"""
Args:
op_type:
inp:
weight:
output:
layout_i:
layout_w:
layout_o:
interleave_i:
interleave_w:
interleave_o:
arch:
mask:
mask_argsort:
indices:
reverse_mask:
mask_filter:
mask_width:
mask_output:
alpha:
beta:
stream_int:
auto_fp32_accum:
fp32_accum:
num_run:
"""
...
def get_tuned_algo(self, op_type: int, i_dtype: int, w_dtype: int, o_dtype: int, k: int, c: int, arch: Tuple[int, int], mask_width: int = -1) -> Tuple[Any, bool]:
"""
Args:
op_type:
i_dtype:
w_dtype:
o_dtype:
k:
c:
arch:
mask_width:
"""
...
def run_with_tuned_result(self, profile_res, op_type: int, inp: Tensor, weight: Tensor, output: Tensor, mask: Tensor, mask_argsort: Tensor, mask_output: Tensor, indices: Tensor, reverse_mask: bool, mask_filter: int = 0xffffffff, mask_width: int = -1, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, workspace: Tensor = Tensor(), verbose: bool = False, timer: CUDAKernelTimer = CUDAKernelTimer(false), force_nvrtc: bool = False) -> None:
"""
Args:
profile_res:
op_type:
inp:
weight:
output:
mask:
mask_argsort:
mask_output:
indices:
reverse_mask:
mask_filter:
mask_width:
alpha:
beta:
stream_int:
workspace:
verbose:
timer:
force_nvrtc:
"""
...
def query_workspace_size(self, desp: ConvAlgoDesp, splitk: int, op_type: int, N: int, C: int, K: int, kv: int) -> int:
"""
Args:
desp:
splitk:
op_type:
N:
C:
K:
kv:
"""
...
spconv/core_cc/csrc/sparse/convops/gemmops.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview import Tensor
from cumm.tensorview.gemm import NVRTCParams
from spconv.core_cc.csrc.sparse.convops import GemmTuneResult
from cumm.tensorview import CUDAKernelTimer
class GemmTunerSimple:
def __init__(self, desps: List[GemmAlgoDesp]) -> None:
"""
Args:
desps:
"""
...
@staticmethod
def get_available_algo_str_from_arch(arch: Tuple[int, int]) -> List[str]:
"""
Args:
arch:
"""
...
def get_all_available(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int) -> List[GemmAlgoDesp]:
"""
Args:
a:
b:
c:
trans_a:
trans_b:
trans_c:
arch:
shuffle_type:
"""
...
def cached_get_nvrtc_params(self, desp: GemmAlgoDesp, arch: Tuple[int, int], stream_int: int) -> NVRTCParams:
"""
Args:
desp:
arch:
stream_int:
"""
...
def tune_and_cache(self, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, stream_int: int = 0, num_run: int = 5) -> Tuple[GemmTuneResult, float]:
"""
Args:
a:
b:
c:
trans_a:
trans_b:
trans_c:
arch:
shuffle_type:
a_inds:
b_inds:
c_inds:
hint:
alpha:
beta:
stream_int:
num_run:
"""
...
def get_tuned_algo(self, a_dtype: int, b_dtype: int, c_dtype: int, a_shape: List[int], b_shape: List[int], c_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], shuffle_type: int, a_inds_shape: List[int], b_inds_shape: List[int], c_inds_shape: List[int], hint: int = 0) -> Tuple[Any, bool]:
"""
Args:
a_dtype:
b_dtype:
c_dtype:
a_shape:
b_shape:
c_shape:
trans_a:
trans_b:
trans_c:
arch:
shuffle_type:
a_inds_shape:
b_inds_shape:
c_inds_shape:
hint:
"""
...
def run_with_tuned_result(self, profile_res, a: Tensor, b: Tensor, c: Tensor, trans_a: bool, trans_b: bool, trans_c: bool, arch: Tuple[int, int], stream_int: int, shuffle_type: int, a_inds: Tensor, b_inds: Tensor, c_inds: Tensor, hint: int = 0, alpha: float = 1.0, beta: float = 0.0, workspace: Tensor = Tensor(), timer: CUDAKernelTimer = CUDAKernelTimer(False), force_nvrtc: bool = False) -> None:
"""
Args:
profile_res:
a:
b:
c:
trans_a:
trans_b:
trans_c:
arch:
stream_int:
shuffle_type:
a_inds:
b_inds:
c_inds:
hint:
alpha:
beta:
workspace:
timer:
force_nvrtc:
"""
...
spconv/core_cc/csrc/sparse/convops/spops.pyi
0 → 100644
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
from cumm.tensorview import CUDAKernelTimer
class ConvGemmOps:
@staticmethod
def get_compute_capability(index: int = -1) -> Tuple[int, int]:
"""
Args:
index:
"""
...
@staticmethod
def indice_conv(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, num_activate_out: int, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None:
"""
1. this function needs to take the out features
that come from the subm first mm.
2. this function doesn't support CPU.
Args:
allocator:
ext_mm:
gemm_tuner:
all_w_is_krsc:
filter_hwio:
features:
filters:
indice_pairs:
indice_pair_num:
num_activate_out:
inverse:
subm:
algo:
stream_int:
"""
...
@staticmethod
def indice_conv_backward(allocator, ext_mm, gemm_tuner, all_w_is_krsc: bool, filter_hwio: bool, features: Tensor, filters: Tensor, out_bp: Tensor, indice_pairs: Tensor, indice_pair_num: Tensor, inverse: bool = False, subm: bool = False, algo: int = 0, stream_int: int = 0) -> None:
"""
Args:
allocator:
ext_mm:
gemm_tuner:
all_w_is_krsc:
filter_hwio:
features:
filters:
out_bp:
indice_pairs:
indice_pair_num:
inverse:
subm:
algo:
stream_int:
"""
...
@staticmethod
def implicit_gemm(allocator, conv_tuner, features: Tensor, filters: Tensor, pair_fwd: Tensor, pair_mask_fwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], num_activate_out: int, masks: Tensor, is_train: bool = False, is_subm: bool = False, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> int:
"""
Args:
allocator:
conv_tuner:
features:
filters:
pair_fwd:
pair_mask_fwd_splits:
mask_argsort_fwd_splits:
num_activate_out:
masks:
is_train:
is_subm:
stream_int:
timer:
auto_fp32_accum:
fp32_accum:
"""
...
@staticmethod
def implicit_gemm_backward(allocator, conv_tuner, features: Tensor, filters: Tensor, out_bp: Tensor, pair_fwd: Tensor, pair_bwd: Tensor, pair_mask_fwd_splits: List[Tensor], pair_mask_bwd_splits: List[Tensor], mask_argsort_fwd_splits: List[Tensor], mask_argsort_bwd_splits: List[Tensor], mask_output_fwd: Tensor, masks: Tensor, mask_width: int, is_subm: bool, stream_int: int = 0, timer: CUDAKernelTimer = CUDAKernelTimer(False), auto_fp32_accum: bool = True, fp32_accum: bool = False) -> None:
"""
Args:
allocator:
conv_tuner:
features:
filters:
out_bp:
pair_fwd:
pair_bwd:
pair_mask_fwd_splits:
pair_mask_bwd_splits:
mask_argsort_fwd_splits:
mask_argsort_bwd_splits:
mask_output_fwd:
masks:
mask_width:
is_subm:
stream_int:
timer:
auto_fp32_accum:
fp32_accum:
"""
...
spconv/core_cc/cumm/common.pyi
View file @
899008fa
...
...
@@ -3,3 +3,10 @@ from pccm.stubs import EnumValue, EnumClassValue
class CompileInfo:
# Stub: returns a list of (int, int) pairs.
# NOTE(review): presumably the (major, minor) CUDA architectures the
# binary was compiled for -- confirm.
@staticmethod
def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
@staticmethod
def arch_is_compiled(arch: Tuple[int, int]) -> bool:
"""
Stub: takes a pair of ints and returns a bool.

NOTE(review): presumably True when `arch` is one of the pairs reported
by `get_compiled_cuda_arch` -- confirm against the implementation.

Args:
arch: pair of ints.
"""
...
spconv/core_cc/cumm/gemm/main.pyi
View file @
899008fa
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview.gemm import GemmAlgoDesp
from cumm.tensorview.gemm import GemmParams
class GemmMainUnitTest:
@staticmethod
def get_all_algo_desp() -> List[
Any
]: ...
def get_all_algo_desp() -> List[
GemmAlgoDesp
]: ...
@staticmethod
def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type:
str = "0"
, a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type:
int = 0
, a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
"""
Args:
a_shape:
...
...
spconv/csrc/sparse/all.py
View file @
899008fa
This diff is collapsed.
Click to expand it.
spconv/csrc/sparse/alloc.py
View file @
899008fa
import
pccm
from
cumm.common
import
TensorView
,
TensorViewCPU
,
TensorViewKernel
,
ThrustLib
from
spconv.constants
import
AllocKeys
class
ExternalAllocatorGuard
(
pccm
.
Class
):
def
__init__
(
self
):
super
().
__init__
()
...
...
@@ -51,6 +53,9 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
...
@@ -61,6 +66,9 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
...
@@ -72,6 +80,9 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
...
@@ -83,6 +94,15 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"float"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"is_temp_memory"
,
"bool"
,
"false"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
return
code
.
ret
(
"tv::Tensor"
)
@pccm.pybind.mark(virtual=True)
@pccm.member_function(virtual=True, pure_virtual=True)
def get_tensor_by_name(self):
    """Declare the pure-virtual `get_tensor_by_name(std::string name)`.

    Only the signature is emitted (no body): one `std::string` argument
    called `name` and a `tv::Tensor` return type.
    """
    decl = pccm.code()
    decl.arg("name", "std::string")
    return decl.ret("tv::Tensor")
@
pccm
.
pybind
.
mark
(
virtual
=
True
)
...
...
@@ -105,9 +125,11 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
// "" means temp memory
auto ten = zeros(
""
, shape, dtype, device);
auto ten = zeros(
name
, shape, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
...
...
@@ -120,8 +142,10 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"shape"
,
"std::vector<int64_t>"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
auto ten = empty(
""
, shape, dtype, device);
auto ten = empty(
name
, shape, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
...
...
@@ -135,8 +159,10 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
auto ten = full_int(
""
, shape, value, dtype, device);
auto ten = full_int(
name
, shape, value, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
...
...
@@ -150,14 +176,16 @@ class ExternalAllocator(pccm.Class):
code
.
arg
(
"value"
,
"int"
)
code
.
arg
(
"dtype"
,
"int"
)
code
.
arg
(
"device"
,
"int"
)
code
.
arg
(
"name"
,
"std::string"
,
"
\"\"
"
)
code
.
arg
(
"stream"
,
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
auto ten = full_float(
""
, shape, value, dtype, device);
auto ten = full_float(
name
, shape, value, dtype, device
, true, stream
);
return std::make_
{
self
.
ptr_type
}
<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
this->free(t);
}});
"""
)
return
code
.
ret
(
f
"std::
{
self
.
ptr_type
}
_ptr<ExternalAllocatorGuard>"
)
class
ThrustAllocator
(
pccm
.
Class
):
def
__init__
(
self
):
super
().
__init__
()
...
...
@@ -179,7 +207,7 @@ class ThrustAllocator(pccm.Class):
code
.
arg
(
"num_bytes"
,
"std::ptrdiff_t"
)
code
.
ret
(
"char*"
)
code
.
raw
(
f
"""
auto ten = allocator_.empty(
""
, {{num_bytes}}, tv::uint8, 0);
auto ten = allocator_.empty(
{
pccm
.
literal
(
AllocKeys
.
ThrustTemp
)
}
, {{num_bytes}}, tv::uint8, 0);
return reinterpret_cast<char*>(ten.raw_data());
"""
)
return
code
...
...
@@ -192,4 +220,159 @@ class ThrustAllocator(pccm.Class):
code
.
raw
(
f
"""
return allocator_.free_noexcept(tv::from_blob(ptr, {{num_bytes}}, tv::uint8, 0));
"""
)
return
code
return
code
class StaticAllocator(ExternalAllocator):
    """A simple allocator for the TensorRT plugin.

    The generated C++ class is constructed from a name -> tensor map
    (`tensor_dict_`). Every named allocation request is served by looking
    the name up in that map and returning a `tv::from_blob` view over the
    stored tensor's memory, after asserting that the stored tensor is large
    enough and lives on the requested device. The only request that is not
    served from the map is the thrust temporary buffer, which is kept in
    `thrust_tmp_tensor_` and re-created when a larger size is requested.

    `free` and `free_noexcept` are emitted with empty bodies.
    """

    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)
        # name -> tensor storage handed in by the caller at construction.
        self.add_member("tensor_dict_",
                        "std::unordered_map<std::string, tv::Tensor>")
        # printable summary of tensor_dict_, used in lookup-failure messages.
        self.add_member("repr_", "std::string")
        # single reusable buffer for thrust temporary storage.
        self.add_member("thrust_tmp_tensor_", "tv::Tensor")
        # factor applied to the requested size when the thrust temporary
        # buffer has to be re-created because it is too small.
        self.grow = 1.5

    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
        """Constructor: store `tensor_dict` and build `repr_` from it.

        `repr_` gets one line per entry (name, shape, dtype string).
        """
        code = pccm.code()
        code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
        code.ctor_init("tensor_dict_", "tensor_dict")
        code.raw(f"""
        std::stringstream ss;
        for (auto& p : tensor_dict){{
            tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
        }}
        repr_ = ss.str();
        """)
        return code

    @pccm.member_function(virtual=True)
    def _get_raw_and_check(self):
        """Return a view with `shape`/`dtype` over the tensor stored as `name`.

        The emitted code asserts that the stored tensor has at least as many
        bytes as the requested view needs and that it is on `device`.

        Fixes relative to the previous version:
        * the element count is accumulated in `int64_t` (the old `1` literal
          made `std::accumulate` accumulate in `int`, which can truncate);
        * the required size is converted from bits to bytes (`/ 8`) before
          being compared with `nbytes()`.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.raw(f"""
        auto res = get_tensor_by_name(name);
        size_t total = std::accumulate(shape.begin(), shape.end(), int64_t(1), std::multiplies<int64_t>());
        TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype)) / 8
            && res.device() == device, "alloc failed", shape, res.shape());
        return tv::from_blob(res.raw_data(), shape, dtype, device);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def zeros(self):
        """Return the checked view for `name`, zeroed on `stream`."""
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.zero_(tvctx);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def empty(self):
        """Return uninitialised storage for `name`.

        The thrust temporary key is handled separately: it is not looked up
        in `tensor_dict_`; instead a single cached tensor is reused and is
        re-created (scaled by `self.grow`) when the first dimension of the
        request exceeds the cached one. Every other name goes through
        `_get_raw_and_check`.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
            // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
            // we assume each allocator always handle one stream
            // so we can just use one tensor
            tv::Tensor res = thrust_tmp_tensor_;
            if (res.empty()){{
                res = tv::empty(shape, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            if (shape[0] > thrust_tmp_tensor_.dim(0)){{
                res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
                thrust_tmp_tensor_ = res;
            }}
            return res;
        }}else{{
            auto blob = _get_raw_and_check(name, shape, dtype, device);
            return blob;
        }}
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_int(self):
        """Return the checked view for `name`, filled with integer `value`.

        The context is now bound to `stream`, matching `zeros`; previously
        the `stream` argument was accepted but ignored.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "int")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def full_float(self):
        """Return the checked view for `name`, filled with float `value`.

        The previous version used `tvctx` in the emitted C++ without ever
        declaring it; the context is now created and bound to `stream` the
        same way `zeros` does it.
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.arg("shape", "std::vector<int64_t>")
        code.arg("value", "float")
        code.arg("dtype", "int")
        code.arg("device", "int")
        code.arg("is_temp_memory", "bool", "false")
        code.arg("stream", "std::uintptr_t", "0")
        code.raw(f"""
        auto tvctx = tv::Context();
        tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
        auto blob = _get_raw_and_check(name, shape, dtype, device);
        return blob.fill_(tvctx, value);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def get_tensor_by_name(self):
        """Return the tensor stored under `name`.

        The emitted code asserts that the name exists and, on failure,
        reports the requested name together with `repr_` (the list of
        available entries).
        """
        code = pccm.code()
        code.arg("name", "std::string")
        code.raw(f"""
        TV_ASSERT_RT_ERR(tensor_dict_.find(name) != tensor_dict_.end(), "can't find", name, "exists:\\n", repr_);
        return tensor_dict_.at(name);
        """)
        return code.ret("tv::Tensor")

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free(self):
        """Emit `free(tv::Tensor ten)` with an empty body."""
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code

    @pccm.pybind.mark
    @pccm.member_function(virtual=True)
    def free_noexcept(self):
        """Emit `free_noexcept(tv::Tensor ten)` with an empty body."""
        code = pccm.code()
        code.arg("ten", "tv::Tensor")
        return code
spconv/csrc/sparse/convops.py
View file @
899008fa
This diff is collapsed.
Click to expand it.
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment