Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
62c1496f
Commit
62c1496f
authored
Oct 19, 2021
by
yan.yan
Browse files
bug fix
parent
01ed382c
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
508 additions
and
354 deletions
+508
-354
.github/workflows/build.yaml
.github/workflows/build.yaml
+96
-0
README.md
README.md
+0
-1
pyproject.toml
pyproject.toml
+1
-1
setup.py
setup.py
+2
-3
spconv/build.py
spconv/build.py
+0
-1
spconv/core_cc/csrc/sparse/all/__init__.pyi
spconv/core_cc/csrc/sparse/all/__init__.pyi
+8
-15
spconv/csrc/sparse/all.py
spconv/csrc/sparse/all.py
+21
-38
spconv/csrc/sparse/indices.py
spconv/csrc/sparse/indices.py
+53
-80
spconv/csrc/sparse/maxpool.py
spconv/csrc/sparse/maxpool.py
+2
-1
spconv/pytorch/conv.py
spconv/pytorch/conv.py
+80
-20
spconv/pytorch/modules.py
spconv/pytorch/modules.py
+0
-47
spconv/pytorch/ops.py
spconv/pytorch/ops.py
+82
-81
spconv/pytorch/pool.py
spconv/pytorch/pool.py
+28
-0
test/aaa.py
test/aaa.py
+94
-45
test/benchmark.py
test/benchmark.py
+6
-5
test/data/test_spconv.pkl
test/data/test_spconv.pkl
+0
-0
test/test_conv.py
test/test_conv.py
+33
-14
tools/build-wheels.sh
tools/build-wheels.sh
+2
-2
No files found.
.github/workflows/build.yaml
0 → 100644
View file @
62c1496f
name
:
build
on
:
push
:
branches
:
-
main
-
feature/*
tags
:
-
'
*'
jobs
:
build-windows
:
runs-on
:
windows-latest
strategy
:
matrix
:
python-version
:
[
'
3.6'
,
'
3.7'
,
'
3.8'
,
'
3.9'
,
'
3.10'
]
cuda-version
:
[
'
10.2'
,
'
11.1'
,
'
11.4'
]
steps
:
-
uses
:
actions/checkout@master
-
name
:
Install CUDA
env
:
CUDA_VERSION
:
${{ matrix.cuda-version }}
PYTHON_VERSION
:
${{ matrix.python-version }}
cuda
:
${{ matrix.cuda-version }}
if
:
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
shell
:
powershell
run
:
.\tools\install_windows_cuda.ps1
-
name
:
Set up Python ${{ matrix.python-version }}
uses
:
actions/setup-python@v2
with
:
python-version
:
${{ matrix.python-version }}
-
uses
:
ilammy/msvc-dev-cmd@v1
-
name
:
Install pep build
run
:
|
python -m pip install build --user
python -m pip install --upgrade pip twine wheel
python -m pip install pytest setuptools
-
name
:
Build a windows binary wheel
env
:
CUDA_VERSION
:
${{ matrix.cuda-version }}
PYTHON_VERSION
:
${{ matrix.python-version }}
if
:
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (env.CUDA_VERSION == '11.1' && env.PYTHON_VERSION == '3.10')
run
:
|
$Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
$Env:CUMM_CUDA_ARCH_LIST = "all"
$Env:SPCONV_DISABLE_JIT = "1"
pip install pccm pybind11
python -m build --wheel --outdir dist/ .
shell
:
powershell
-
name
:
Publish a Python distribution to PyPI
if
:
github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
run
:
|
$Env:TWINE_USERNAME = "__token__"
$Env:TWINE_PASSWORD = "${{ secrets.pypi_password }}"
twine upload dist/*
shell
:
powershell
build
:
needs
:
build-windows
runs-on
:
ubuntu-20.04
strategy
:
matrix
:
python-version
:
[
'
3.8'
]
# this version is only used for upload.
cuda-version
:
[
'
102'
,
'
111'
,
'
114'
]
steps
:
-
uses
:
actions/checkout@master
-
name
:
Set up Python ${{ matrix.python-version }}
uses
:
actions/setup-python@v2
with
:
python-version
:
${{ matrix.python-version }}
-
name
:
Install pep build
run
:
|
python -m pip install build --user
python -m pip install --upgrade pip twine wheel
python -m pip install pytest setuptools
-
name
:
Build a cuda wheel
env
:
CUDA_VERSION
:
${{ matrix.cuda-version }}
PYTHON_VERSION
:
${{ matrix.python-version }}
DOCKER_IMAGE
:
scrin/manylinux2014-cuda:cu${{ matrix.cuda-version }}-devel
PLAT
:
manylinux2014_x86_64
if
:
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || env.CUDA_VERSION == '114'
run
:
|
docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -v `pwd`:/io $DOCKER_IMAGE bash -c "/io/tools/build-wheels.sh"
-
name
:
Publish a Python distribution to PyPI
if
:
github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
uses
:
pypa/gh-action-pypi-publish@master
with
:
user
:
__token__
password
:
${{ secrets.pypi_password }}
\ No newline at end of file
README.md
View file @
62c1496f
...
...
@@ -34,7 +34,6 @@
*
training/inference speed is increased
*
support int8/tensor core
*
doesn't depend on pytorch binary.
*
If your GPU has tensor core, try mixed precision training in spconv 2.x!
*
since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference.
## TODO in Spconv 2.x
...
...
pyproject.toml
View file @
62c1496f
[build-system]
requires
=
[
"setuptools>=41.0"
,
"wheel"
,
"pccm>=0.2.5"
,
"cumm>=0.1.
3
"
]
requires
=
[
"setuptools>=41.0"
,
"wheel"
,
"pccm>=0.2.5"
,
"cumm>=0.1.
5
"
]
build-backend
=
"setuptools.build_meta"
setup.py
View file @
62c1496f
...
...
@@ -30,7 +30,7 @@ DESCRIPTION = 'spatial sparse convolution'
URL
=
'https://github.com/traveller59/spconv'
EMAIL
=
'yanyan.sub@outlook.com'
AUTHOR
=
'Yan Yan'
REQUIRES_PYTHON
=
'>=3.
7
'
REQUIRES_PYTHON
=
'>=3.
6
'
VERSION
=
None
# What packages are required for this module to be executed?
...
...
@@ -134,12 +134,11 @@ if disable_jit is not None and disable_jit == "1":
}
from
cumm.gemm.main
import
GemmMainUnitTest
,
SHUFFLE_SIMT_PARAMS
,
SHUFFLE_VOLTA_PARAMS
,
SHUFFLE_TURING_PARAMS
from
spconv.csrc.sparse.all
import
SpconvOps
from
cumm.gemm.gather
import
GatherAll
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
cu
.
namespace
=
"cumm.gemm.main"
ext_modules
:
List
[
Extension
]
=
[
PCCMExtension
([
cu
,
SpconvOps
()
,
GatherAll
()
],
PCCMExtension
([
cu
,
SpconvOps
()],
"spconv/core_cc"
,
Path
(
__file__
).
resolve
().
parent
/
"spconv"
)
]
...
...
spconv/build.py
View file @
62c1496f
...
...
@@ -22,7 +22,6 @@ from .constants import PACKAGE_NAME, PACKAGE_ROOT
if
project_is_installed
(
PACKAGE_NAME
)
and
project_is_editable
(
PACKAGE_NAME
):
from
cumm.gemm.main
import
GemmMainUnitTest
,
SHUFFLE_SIMT_PARAMS
,
SHUFFLE_VOLTA_PARAMS
,
SHUFFLE_TURING_PARAMS
from
spconv.csrc.sparse.all
import
SpconvOps
# from cumm.gemm.gather import GatherAll, ScatterAll
cu
=
GemmMainUnitTest
(
SHUFFLE_SIMT_PARAMS
+
SHUFFLE_VOLTA_PARAMS
+
SHUFFLE_TURING_PARAMS
)
cu
.
namespace
=
"cumm.gemm.main"
pccm
.
builder
.
build_pybind
([
cu
,
SpconvOps
()],
...
...
spconv/core_cc/csrc/sparse/all/__init__.pyi
View file @
62c1496f
...
...
@@ -3,14 +3,12 @@ from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class SpconvOps:
@staticmethod
def generate_conv_inds(indices: Tensor,
hashdata: Tensor,
indice_pairs: Tensor, indice_pairs_uniq: Tensor,
out_inds: Tensor,
indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int]) ->
int
:
def generate_conv_inds
_stage1
(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int]
, transposed: bool = False, stream_int: int = 0
) ->
None
:
"""
Args:
indices:
hashdata:
indice_pairs:
indice_pairs_uniq:
out_inds:
indice_num_per_loc:
batch_size:
output_dims:
...
...
@@ -19,28 +17,22 @@ class SpconvOps:
stride:
padding:
dilation:
transposed:
stream_int:
"""
...
@staticmethod
def generate_conv_inds_stage1(
indices: Tensor, indice_pairs: Tensor,
indice_pairs_uniq: Tensor,
i
ndi
ce_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int]
, stream_int: int = 0) -> int:
def generate_conv_inds_stage1
_5
(indice_pairs_uniq: Tensor, ndi
m: int, uniq_size: int
, stream_int: int = 0) -> int:
"""
Args:
indices:
indice_pairs:
indice_pairs_uniq:
indice_num_per_loc:
batch_size:
output_dims:
input_dims:
ksize:
stride:
padding:
dilation:
ndim:
uniq_size:
stream_int:
"""
...
@staticmethod
def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], stream_int: int = 0) -> int:
def generate_conv_inds_stage2(indices: Tensor, hashdata: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, out_inds: Tensor, num_out_act: int, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int],
transposed: bool = False,
stream_int: int = 0) -> int:
"""
Args:
indices:
...
...
@@ -56,6 +48,7 @@ class SpconvOps:
stride:
padding:
dilation:
transposed:
stream_int:
"""
...
...
...
spconv/csrc/sparse/all.py
View file @
62c1496f
...
...
@@ -35,26 +35,30 @@ class SpconvOps(pccm.Class):
problem
=
ConvProblem
(
ndim
,
ConvOpType
.
kForward
,
NHWC
,
NHWC
,
NHWC
)
indices
=
SparseConvIndicesKernel
(
problem
,
dtypes
.
int32
)
# self.add_param_class("ops", indices, "SpconvIndices")
cuda_funcs
=
[
self
.
generate_conv_inds
,
self
.
generate_subm_conv_inds
,
self
.
generate_conv_inds_stage1
,
self
.
generate_conv_inds_stage2
,
self
.
sort_1d_by_key
]
cuda_funcs
=
[
self
.
generate_subm_conv_inds
,
self
.
generate_conv_inds_stage1
,
self
.
generate_conv_inds_stage1_5
,
self
.
generate_conv_inds_stage2
,
self
.
sort_1d_by_key
]
self
.
add_impl_only_param_class
(
cuda_funcs
,
f
"ops
{
ndim
}
d"
,
indices
,
f
"SpconvIndices
{
ndim
}
D"
)
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
static_function
def
generate_conv_inds
(
self
):
def
generate_conv_inds
_stage1
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indices
, hashdata
"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq,
out_inds,
indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"indices"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"std::vector<int>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"std::vector<int>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
raw
(
f
"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
"""
)
for
ndim
in
self
.
ndims
:
code
.
raw
(
f
"""
if (ndim ==
{
ndim
}
){{
...
...
@@ -68,53 +72,31 @@ class SpconvOps(pccm.Class):
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices
{
ndim
}
D::generate_conv_inds(indices,
hashdata,
indice_pairs, indice_pairs_uniq,
out_inds,
indice_num_per_loc,
return SpconvIndices
{
ndim
}
D::generate_conv_inds
_stage1
(indices,
indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
ksize_, stride_, padding_, dilation_
, transposed, stream_int
);
}}
"""
)
code
.
raw
(
f
"""TV_THROW_RT_ERR("unknown ndim", ndim);"""
)
return
code
.
ret
(
"int"
)
return
code
# .ret("int")
@
pccm
.
pybind
.
mark
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage1
(
self
):
def
generate_conv_inds_stage1
_5
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indices"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"std::vector<int>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"std::vector<int>"
)
code
.
arg
(
"indice_pairs_uniq"
,
"tv::Tensor"
)
code
.
arg
(
"ndim"
,
"int"
)
code
.
arg
(
"uniq_size"
,
"int64_t"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
raw
(
f
"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
"""
)
for
ndim
in
self
.
ndims
:
code
.
raw
(
f
"""
if (ndim ==
{
ndim
}
){{
tv::array<int,
{
ndim
}
> output_dims_, input_dims_;
tv::array<int,
{
ndim
}
> ksize_, stride_, padding_, dilation_;
for (int i = 0; i <
{
ndim
}
; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices
{
ndim
}
D::generate_conv_inds_stage1(indices,
indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
return SpconvIndices
{
ndim
}
D::generate_conv_inds_stage1_5(indice_pairs_uniq, uniq_size, stream_int);
}}
"""
)
code
.
raw
(
f
"""TV_THROW_RT_ERR("unknown ndim", ndim);"""
)
return
code
.
ret
(
"int"
)
@
pccm
.
pybind
.
mark
...
...
@@ -127,6 +109,7 @@ class SpconvOps(pccm.Class):
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"std::vector<int>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"std::vector<int>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
,
pyanno
=
"int"
)
code
.
raw
(
f
"""
int ndim = indices.dim(1) - 1;
...
...
@@ -151,7 +134,7 @@ class SpconvOps(pccm.Class):
return SpconvIndices
{
ndim
}
D::generate_conv_inds_stage2(indices, hashdata,
indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
ksize_, stride_, padding_, dilation_
, transposed, stream_int
);
}}
"""
)
code
.
raw
(
f
"""TV_THROW_RT_ERR("unknown ndim", ndim);"""
)
...
...
spconv/csrc/sparse/indices.py
View file @
62c1496f
...
...
@@ -225,6 +225,28 @@ class ConvOutLocIter(pccm.ParameterizedClass):
"""
)
return
code
@
pccm
.
cuda
.
member_function
(
host
=
True
,
device
=
True
,
forceinline
=
True
,
const
=
True
)
def
query_nhw_out
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"npq_offset"
,
"const int*"
)
code
.
arg
(
"nhw_offset"
,
f
"tv::array<int,
{
self
.
ndim
+
1
}
>&"
)
code
.
ret
(
"bool"
)
code
.
raw
(
f
"""
nhw_offset = npq_to_nhw(npq_offset);
"""
)
hw_valid
=
[]
# type: List[str]
for
i
in
range
(
self
.
ndim
):
hw_valid
.
append
((
f
"nhw_offset[
{
i
+
1
}
] >= 0 && "
f
"nhw_offset[
{
i
+
1
}
] < problem_.output_dims[
{
i
}
]"
))
code
.
raw
(
f
"""
return nhw_offset[0] < problem_.N &&
{
' && '
.
join
(
hw_valid
)
}
;
"""
)
return
code
class
SparseConvIndicesKernel
(
pccm
.
ParameterizedClass
):
def
__init__
(
self
,
problem
:
ConvProblem
,
dtype_indices
:
dtypes
.
DType
):
super
().
__init__
()
...
...
@@ -255,7 +277,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code
.
arg
(
"indices_pair_size"
,
"int"
)
code
.
arg
(
"RS"
,
"int"
)
#
code.arg(
"bool",
"transposed")
code
.
arg
(
"transposed"
,
"bool"
)
code
.
raw
(
f
"""
int filter_offset = blockIdx.y;
...
...
@@ -264,7 +286,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
tv::array<int,
{
self
.
ndim
+
1
}
> npq_offset;
if (loc_iter.query_npq(indices_in + i *
{
self
.
ndim
+
1
}
, npq_offset)){{
bool valid;
if (transposed){{
valid = loc_iter.query_nhw_out(indices_in + i *
{
self
.
ndim
+
1
}
, npq_offset);
}}else{{
valid = loc_iter.query_npq(indices_in + i *
{
self
.
ndim
+
1
}
, npq_offset);
}}
if (valid){{
int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
{
self
.
dtype_indices
}
offset = loc_iter.layout_npq(npq_offset);
if (old_num < indices_pair_size){{
...
...
@@ -514,81 +542,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""
)
return
code
@
pccm
.
cuda
.
static_function
def
generate_conv_inds
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indices, hashdata"
,
"tv::Tensor"
)
code
.
arg
(
"indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc"
,
"tv::Tensor"
)
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
raw
(
f
"""
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
// out_inds: [MaxSize,
{
self
.
ndim
+
1
}
]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
int64_t expected_out_size = indices.dim(0) * kv;
TV_ASSERT_RT_ERR(out_inds.dim(0) == expected_out_size && out_inds.dim(1) ==
{
self
.
ndim
+
1
}
, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0));
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch launcher_clean_uniq(uniq_size);
launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<
{
self
.
dtype_indices
}
>(), uniq_size);
tv::ssprint("clean time", timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<
{
self
.
dtype_indices
}
>(),
indice_pairs_uniq.data_ptr<
{
self
.
dtype_indices
}
>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv);
tv::ssprint("calc_conv_indices_stage1 time", timer.report() / 1000.0, uniq_size);
thrust::device_ptr<
{
self
.
dtype_indices
}
> ptr_tr(indice_pairs_uniq.data_ptr<
{
self
.
dtype_indices
}
>());
auto thrust_ctx = thrust::cuda::par.on(0);
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto num_out_act = new_end - ptr_tr - 1;
tv::ssprint("unique time", num_out_act, timer.report() / 1000.0);
// return num_out_act;
// TODO handle invalid num_out_act
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
tv::cuda::Launch lanucher_build_hash(num_out_act);
using V =
{
self
.
dtype_indices
}
;
using KeyType =
{
self
.
dtype_indices
}
;
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear();
tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
lanucher_build_hash(build_conv_hash_table<table_t>, hash, out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const
{
self
.
dtype_indices
}
>(),
loc_iter.layout_npq, num_out_act);
tv::ssprint("build_hash time", num_out_act, timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, indice_pairs[1].data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2));
tv::ssprint("gem conv inds time", timer.report() / 1000.0);
return num_out_act;
"""
)
return
code
.
ret
(
"int"
)
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage1
(
self
):
code
=
pccm
.
FunctionCode
()
...
...
@@ -597,6 +550,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
...
...
@@ -620,7 +575,23 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<
{
self
.
dtype_indices
}
>(),
indice_pairs_uniq.data_ptr<
{
self
.
dtype_indices
}
>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv);
indice_pairs.dim(2), kv, transposed);
// thrust::device_ptr<
{
self
.
dtype_indices
}
> ptr_tr(indice_pairs_uniq.data_ptr<
{
self
.
dtype_indices
}
>());
// auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
// thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act;
"""
)
return
code
# .ret("int")
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage1_5
(
self
):
code
=
pccm
.
FunctionCode
()
code
.
arg
(
"indice_pairs_uniq"
,
"tv::Tensor"
)
code
.
arg
(
"uniq_size"
,
"int64_t"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
thrust::device_ptr<
{
self
.
dtype_indices
}
> ptr_tr(indice_pairs_uniq.data_ptr<
{
self
.
dtype_indices
}
>());
auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
...
...
@@ -630,6 +601,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""
)
return
code
.
ret
(
"int"
)
@
pccm
.
cuda
.
static_function
def
generate_conv_inds_stage2
(
self
):
code
=
pccm
.
FunctionCode
()
...
...
@@ -639,6 +611,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code
.
arg
(
"batch_size"
,
"int"
)
code
.
arg
(
"output_dims, input_dims"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"ksize, stride, padding, dilation"
,
f
"tv::array<int,
{
self
.
ndim
}
>"
)
code
.
arg
(
"transposed"
,
f
"bool"
,
"false"
)
code
.
arg
(
"stream_int"
,
f
"std::uintptr_t"
,
"0"
)
code
.
raw
(
f
"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
...
...
@@ -651,7 +624,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// out_inds: [MaxSize,
{
self
.
ndim
+
1
}
]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0)
=
=
uniq_size
, "error");
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0)
>
=
num_out_act
, "error");
TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) ==
{
self
.
ndim
+
1
}
, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = kv;
...
...
spconv/csrc/sparse/maxpool.py
View file @
62c1496f
...
...
@@ -22,7 +22,9 @@ from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, Thru
from
cumm.gemm
import
codeops
from
typing
import
List
from
cumm.conv.params
import
ConvProblem
from
cumm.gemm.mask_iters
import
MaskTileIterator
,
MaskTileIteratorParams
import
numpy
as
np
from
cumm.gemm
import
(
thread_map
)
class
IndiceMaxPool
(
pccm
.
Class
):
# TODO optimize this function
...
...
@@ -171,4 +173,3 @@ class IndiceMaxPool(pccm.Class):
}});
"""
)
return
code
spconv/pytorch/conv.py
View file @
62c1496f
...
...
@@ -14,6 +14,7 @@
import
math
import
time
from
typing
import
List
,
Optional
,
Tuple
,
Union
import
numpy
as
np
import
torch
...
...
@@ -39,6 +40,10 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
if
dimensions
==
2
:
# Linear
fan_in
=
tensor
.
size
(
-
2
)
fan_out
=
tensor
.
size
(
-
1
)
else
:
if
FILTER_HWIO
:
num_input_fmaps
=
tensor
.
size
(
-
2
)
num_output_fmaps
=
tensor
.
size
(
-
1
)
else
:
num_input_fmaps
=
tensor
.
size
(
-
1
)
num_output_fmaps
=
tensor
.
size
(
-
2
)
...
...
@@ -58,22 +63,22 @@ class SparseConvolution(SparseModule):
]
def
__init__
(
self
,
ndim
,
in_channels
,
out_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
0
,
dilation
=
1
,
groups
=
1
,
bias
=
True
,
subm
=
False
,
output_padding
=
0
,
transposed
=
False
,
inverse
=
False
,
indice_key
=
None
,
fused_bn
=
False
,
algo
=
ops
.
ConvAlgo
.
Native
,
ndim
:
int
,
in_channels
:
int
,
out_channels
:
int
,
kernel_size
:
Union
[
int
,
List
[
int
],
Tuple
[
int
,
...]]
=
3
,
stride
:
Union
[
int
,
List
[
int
],
Tuple
[
int
,
...]]
=
1
,
padding
:
Union
[
int
,
List
[
int
],
Tuple
[
int
,
...]]
=
0
,
dilation
:
Union
[
int
,
List
[
int
],
Tuple
[
int
,
...]]
=
1
,
groups
:
Union
[
int
,
List
[
int
],
Tuple
[
int
,
...]]
=
1
,
bias
:
bool
=
True
,
subm
:
bool
=
False
,
output_padding
:
Union
[
int
,
List
[
int
],
Tuple
[
int
,
...]]
=
0
,
transposed
:
bool
=
False
,
inverse
:
bool
=
False
,
indice_key
:
Optional
[
str
]
=
None
,
fused_bn
:
bool
=
False
,
algo
:
ops
.
ConvAlgo
=
ops
.
ConvAlgo
.
Native
,
name
=
None
):
super
(
SparseConvolution
,
self
).
__init__
(
name
=
name
)
assert
groups
==
1
...
...
@@ -117,8 +122,6 @@ class SparseConvolution(SparseModule):
self
.
bias
=
Parameter
(
torch
.
Tensor
(
out_channels
))
else
:
self
.
register_parameter
(
'bias'
,
None
)
# self.workspace_for_splitk = torch.zeros((GLOBAL_MAXIMUM_SPLITK,), dtype=torch.int8)
# self.register_buffer("workspace_for_splitk", self.workspace_for_splitk)
self
.
reset_parameters
()
def
reset_parameters
(
self
):
...
...
@@ -234,6 +237,7 @@ class SparseConvolution(SparseModule):
t
=
time
.
time
()
if
self
.
fused_bn
:
raise
NotImplementedError
assert
self
.
bias
is
not
None
out_features
=
ops
.
fused_indice_conv
(
features
,
self
.
weight
,
self
.
bias
,
...
...
@@ -382,6 +386,34 @@ class SparseConv4d(SparseConvolution):
name
=
name
)
class
SparseConvTranspose1d
(
SparseConvolution
):
def
__init__
(
self
,
in_channels
,
out_channels
,
kernel_size
,
stride
=
1
,
padding
=
0
,
dilation
=
1
,
groups
=
1
,
bias
=
True
,
indice_key
=
None
,
algo
=
ops
.
ConvAlgo
.
Native
,
name
=
None
):
super
(
SparseConvTranspose1d
,
self
).
__init__
(
1
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
,
groups
,
bias
,
transposed
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
name
=
name
)
class
SparseConvTranspose2d
(
SparseConvolution
):
def
__init__
(
self
,
in_channels
,
...
...
@@ -437,6 +469,34 @@ class SparseConvTranspose3d(SparseConvolution):
algo
=
algo
,
name
=
name
)
class
SparseConvTranspose4d
(
SparseConvolution
):
def
__init__
(
self
,
in_channels
,
out_channels
,
kernel_size
,
stride
=
1
,
padding
=
0
,
dilation
=
1
,
groups
=
1
,
bias
=
True
,
indice_key
=
None
,
algo
=
ops
.
ConvAlgo
.
Native
,
name
=
None
):
super
(
SparseConvTranspose4d
,
self
).
__init__
(
4
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
,
groups
,
bias
,
transposed
=
True
,
indice_key
=
indice_key
,
algo
=
algo
,
name
=
name
)
class
SparseInverseConv1d
(
SparseConvolution
):
def
__init__
(
self
,
in_channels
,
...
...
spconv/pytorch/modules.py
View file @
62c1496f
...
...
@@ -143,50 +143,3 @@ class SparseSequential(SparseModule):
input
=
module
(
input
)
return
input
def
fused
(
self
):
"""don't use this. no effect.
"""
from
spconv.pytorch.conv
import
SparseConvolution
mods
=
[
v
for
k
,
v
in
self
.
_modules
.
items
()]
fused_mods
=
[]
idx
=
0
while
idx
<
len
(
mods
):
if
is_sparse_conv
(
mods
[
idx
]):
if
idx
<
len
(
mods
)
-
1
and
isinstance
(
mods
[
idx
+
1
],
nn
.
BatchNorm1d
):
new_module
=
SparseConvolution
(
ndim
=
mods
[
idx
].
ndim
,
in_channels
=
mods
[
idx
].
in_channels
,
out_channels
=
mods
[
idx
].
out_channels
,
kernel_size
=
mods
[
idx
].
kernel_size
,
stride
=
mods
[
idx
].
stride
,
padding
=
mods
[
idx
].
padding
,
dilation
=
mods
[
idx
].
dilation
,
groups
=
mods
[
idx
].
groups
,
bias
=
True
,
subm
=
mods
[
idx
].
subm
,
output_padding
=
mods
[
idx
].
output_padding
,
transposed
=
mods
[
idx
].
transposed
,
inverse
=
mods
[
idx
].
inverse
,
indice_key
=
mods
[
idx
].
indice_key
,
fused_bn
=
True
,
)
new_module
.
load_state_dict
(
mods
[
idx
].
state_dict
(),
False
)
new_module
.
to
(
mods
[
idx
].
weight
.
device
)
conv
=
new_module
bn
=
mods
[
idx
+
1
]
conv
.
bias
.
data
.
zero_
()
conv
.
weight
.
data
[:]
=
conv
.
weight
.
data
*
bn
.
weight
.
data
/
(
torch
.
sqrt
(
bn
.
running_var
)
+
bn
.
eps
)
conv
.
bias
.
data
[:]
=
(
conv
.
bias
.
data
-
bn
.
running_mean
)
*
bn
.
weight
.
data
/
(
torch
.
sqrt
(
bn
.
running_var
)
+
bn
.
eps
)
+
bn
.
bias
.
data
fused_mods
.
append
(
conv
)
idx
+=
2
else
:
fused_mods
.
append
(
mods
[
idx
])
idx
+=
1
else
:
fused_mods
.
append
(
mods
[
idx
])
idx
+=
1
return
SparseSequential
(
*
fused_mods
)
spconv/pytorch/ops.py
View file @
62c1496f
...
...
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
functools
from
enum
import
Enum
from
cumm
import
tensorview
as
tv
from
cumm.gemm.algospec.core
import
ShuffleStrideType
...
...
@@ -23,7 +23,7 @@ from spconv.algo import AlgoHint, ConvAlgo
from
typing
import
List
,
Union
from
spconv.pytorch.cppcore
import
torch_tensor_to_tv
,
get_current_stream
from
spconv.core_cc.csrc.sparse.all
import
SpconvOps
from
spconv.algo
import
GEMM
# , GATHER, SCATTER
from
spconv.algo
import
GEMM
# , GATHER, SCATTER
import
time
from
spconv.constants
import
FILTER_HWIO
...
...
@@ -58,26 +58,17 @@ def get_indice_pairs(indices: torch.Tensor,
batch_size
:
int
,
spatial_shape
:
List
[
int
],
algo
:
ConvAlgo
,
ksize
:
Union
[
int
,
List
[
int
]
]
,
stride
:
Union
[
int
,
List
[
int
]
]
,
padding
:
Union
[
int
,
List
[
int
]
]
,
dilation
:
Union
[
int
,
List
[
int
]
]
,
out_padding
:
Union
[
int
,
List
[
int
]
]
,
ksize
:
List
[
int
],
stride
:
List
[
int
],
padding
:
List
[
int
],
dilation
:
List
[
int
],
out_padding
:
List
[
int
],
subm
:
bool
=
False
,
transpose
:
bool
=
False
):
# torch.cuda.synchronize()
# t = time.time()
ndim
=
indices
.
shape
[
1
]
-
1
if
not
isinstance
(
ksize
,
(
list
,
tuple
)):
ksize
=
[
ksize
]
*
ndim
if
not
isinstance
(
stride
,
(
list
,
tuple
)):
stride
=
[
stride
]
*
ndim
if
not
isinstance
(
padding
,
(
list
,
tuple
)):
padding
=
[
padding
]
*
ndim
if
not
isinstance
(
dilation
,
(
list
,
tuple
)):
dilation
=
[
dilation
]
*
ndim
if
not
isinstance
(
out_padding
,
(
list
,
tuple
)):
out_padding
=
[
out_padding
]
*
ndim
kv
:
int
=
int
(
np
.
prod
(
ksize
))
kv
:
int
=
functools
.
reduce
(
lambda
x
,
y
:
x
*
y
,
ksize
,
1
)
if
not
subm
:
if
transpose
:
out_shape
=
get_deconv_output_size
(
spatial_shape
,
ksize
,
stride
,
...
...
@@ -87,8 +78,9 @@ def get_indice_pairs(indices: torch.Tensor,
padding
,
dilation
)
else
:
out_shape
=
spatial_shape
assert
algo
==
ConvAlgo
.
Native
and
not
transpose
,
"TODO"
assert
algo
==
ConvAlgo
.
Native
,
"TODO"
stream
=
get_current_stream
()
pair
=
torch
.
full
((
2
,
kv
,
indices
.
shape
[
0
]),
-
1
,
dtype
=
indices
.
dtype
,
...
...
@@ -96,19 +88,20 @@ def get_indice_pairs(indices: torch.Tensor,
indice_num_per_loc
=
torch
.
zeros
((
kv
,
),
dtype
=
indices
.
dtype
,
device
=
indices
.
device
)
inds_tv
=
torch_tensor_to_tv
(
indices
)
pair_tv
=
torch_tensor_to_tv
(
pair
)
indice_num_per_loc_tv
=
torch_tensor_to_tv
(
indice_num_per_loc
)
# torch.cuda.synchronize()
# t = time.time()
if
subm
:
out_inds
=
indices
hashdata
=
torch
.
empty
((
out_inds
.
shape
[
0
]
*
2
,
),
dtype
=
torch
.
int64
,
device
=
indices
.
device
)
out_inds_tv
=
torch_tensor_to_tv
(
out_inds
)
hashdata_tv
=
torch_tensor_to_tv
(
hashdata
,
dtype
=
tv
.
custom64
)
SpconvOps
.
generate_subm_conv_inds
(
inds_tv
,
hashdata_tv
,
pair_tv
,
...
...
@@ -120,16 +113,15 @@ def get_indice_pairs(indices: torch.Tensor,
dilation
=
dilation
,
stream_int
=
stream
)
# torch.cuda.synchronize()
# print("SUBM INDICE GEN", time.time() - t)
# print("SUBM", time.time() - t)
else
:
indice_pairs_uniq
=
torch
.
empty
((
pair
.
numel
()
//
2
+
1
,
),
dtype
=
indices
.
dtype
,
device
=
indices
.
device
)
indice_pairs_uniq_tv
=
torch_tensor_to_tv
(
indice_pairs_uniq
)
num_act_out
=
SpconvOps
.
generate_conv_inds_stage1
(
inds_tv
,
SpconvOps
.
generate_conv_inds_stage1
(
inds_tv
,
pair_tv
,
indice_pairs_uniq_tv
,
indice_num_per_loc_tv
,
...
...
@@ -140,7 +132,17 @@ def get_indice_pairs(indices: torch.Tensor,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
,
transposed
=
transpose
,
stream_int
=
stream
)
uniq_res
=
indice_pairs_uniq
.
unique
()
num_act_out
=
uniq_res
.
shape
[
0
]
-
1
uniq_res_tv
=
torch_tensor_to_tv
(
uniq_res
)
# num_act_out = SpconvOps.generate_conv_inds_stage1_5(
# indice_pairs_uniq_tv,
# ndim,
# uniq_size=indice_pairs_uniq_tv.size,
# stream_int=stream)
# uniq_res_tv = indice_pairs_uniq_tv.slice_first_axis(0, num_act_out)
out_inds
=
torch
.
empty
((
num_act_out
,
indices
.
shape
[
1
]),
dtype
=
indices
.
dtype
,
device
=
indices
.
device
)
...
...
@@ -152,7 +154,7 @@ def get_indice_pairs(indices: torch.Tensor,
SpconvOps
.
generate_conv_inds_stage2
(
inds_tv
,
hashdata_tv
,
pair_tv
,
indice_pairs_
uniq_tv
,
uniq
_res
_tv
,
out_inds_tv
,
num_out_act
=
num_act_out
,
batch_size
=
batch_size
,
...
...
@@ -162,11 +164,10 @@ def get_indice_pairs(indices: torch.Tensor,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
,
transposed
=
transpose
,
stream_int
=
stream
)
# torch.cuda.synchronize()
# print("INDICE GEN", time.time() - t)
# print("REGU", time.time() - t)
return
out_inds
,
pair
,
indice_num_per_loc
...
...
@@ -228,8 +229,6 @@ def indice_conv(features: torch.Tensor,
c_inds_shape
=
[
nhot_profile
],
hint
=
AlgoHint
.
Fowrard
.
value
)
gather_data_tv
=
tv
.
Tensor
()
scatter_data_tv
=
tv
.
Tensor
()
maxnhot
=
max
(
indice_pair_num_cpu
)
if
profile_res
is
None
:
...
...
@@ -270,13 +269,10 @@ def indice_conv(features: torch.Tensor,
continue
inp_indices
=
pair_in
[
i
].
slice_first_axis
(
0
,
nhot
)
out_indices
=
pair_out
[
i
].
slice_first_axis
(
0
,
nhot
)
# inp_indices = torch_tensor_to_tv(inp_indices_th)
# out_indices = torch_tensor_to_tv(out_indices_th)
b
=
filters_tv
[
i
]
# inp @ filter.T, NC @ KC
beta
=
1.0
if
inited
else
0.0
algo_desp
=
GEMM
.
run_profile
(
profile_res
,
algo_desp
=
GEMM
.
run_profile
(
profile_res
,
a
,
b
,
c
,
...
...
@@ -295,11 +291,11 @@ def indice_conv(features: torch.Tensor,
# gather_times += gather_time
inited
=
True
# torch.cuda.synchronize()
# print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
# print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t
, total_times
)
#
#
print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
#
#
print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t)
# print(indice_pair_num_cpu)
# print(time.time() - t)
#
#
print(indice_pair_num_cpu)
# print(
"G",
time.time() - t)
return
out_features
...
...
@@ -316,8 +312,6 @@ def indice_conv_backward(features: torch.Tensor,
inverse
:
bool
=
False
,
subm
:
bool
=
False
,
algo
:
ConvAlgo
=
ConvAlgo
.
Native
):
# workspace = torch.empty((10000), dtype=torch.uint8, device=features.device)
# workspace_tv = torch_tensor_to_tv(workspace)
# torch.cuda.synchronize()
# t = time.time()
...
...
@@ -400,7 +394,6 @@ def indice_conv_backward(features: torch.Tensor,
c_inds
=
out_indices
,
alpha
=
1.0
,
beta
=
0.0
,
# scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
hint
=
AlgoHint
.
BackwardInput
.
value
,
stream
=
stream
)
if
not
FILTER_HWIO
:
...
...
@@ -445,7 +438,6 @@ def indice_conv_backward(features: torch.Tensor,
b_inds
=
b_inds_wgrad
,
alpha
=
1.0
,
beta
=
0.0
,
# scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
hint
=
AlgoHint
.
BackwardWeight
.
value
,
stream
=
stream
)
# print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time)
...
...
@@ -457,8 +449,9 @@ def indice_conv_backward(features: torch.Tensor,
else
:
b_shape
=
[
maxnhot
,
out_bp_tv
.
dim
(
1
)]
a_shape
=
[
maxnhot
,
features_tv
.
dim
(
1
)]
m
,
n
,
k
=
GEMM
.
extract_mnk
(
a_shape
,
b_shape
,
profile_res_wgrad
.
algo_desp
.
trans_a
,
m
,
n
,
k
=
GEMM
.
extract_mnk
(
a_shape
,
b_shape
,
profile_res_wgrad
.
algo_desp
.
trans_a
,
profile_res_wgrad
.
algo_desp
.
trans_b
,
profile_res_wgrad
.
algo_desp
.
trans_c
,
arch
=
arch
,
...
...
@@ -466,12 +459,15 @@ def indice_conv_backward(features: torch.Tensor,
a_inds_shape
=
[
maxnhot
],
b_inds_shape
=
[
maxnhot
],
hint
=
AlgoHint
.
BackwardWeight
.
value
)
workspace_size
=
profile_res_wgrad
.
algo_desp
.
query_workspace_size
(
m
,
n
,
k
,
profile_res_wgrad
.
splitk
)
workspace_size
=
profile_res_wgrad
.
algo_desp
.
query_workspace_size
(
m
,
n
,
k
,
profile_res_wgrad
.
splitk
)
workspace
=
torch
.
Tensor
()
workspace_tv
=
tv
.
Tensor
()
if
workspace_size
>
0
:
workspace
=
torch
.
empty
((
workspace_size
,),
dtype
=
torch
.
int8
,
device
=
features
.
device
)
workspace
=
torch
.
empty
((
workspace_size
,
),
dtype
=
torch
.
int8
,
device
=
features
.
device
)
workspace_tv
=
torch_tensor_to_tv
(
workspace
)
# print(workspace_size, m, n, k, profile_res_wgrad.splitk)
# torch.cuda.synchronize()
...
...
@@ -538,11 +534,13 @@ def indice_conv_backward(features: torch.Tensor,
# dw_time = time.time() - t
# # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape)
# # print(dw_time + di_time)
# print(time.time() - t)
# print(
"BWG",
time.time() - t)
return
(
din
,
dfilters
.
reshape
(
filters_shape
))
def
indice_maxpool
(
features
,
indice_pairs
,
indice_pair_num
,
num_activate_out
):
# torch.cuda.synchronize()
# t = time.time()
out_channel
=
features
.
shape
[
-
1
]
out_features
=
torch
.
zeros
((
num_activate_out
,
out_channel
),
dtype
=
features
.
dtype
,
...
...
@@ -558,6 +556,9 @@ def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
out_indices
=
torch_tensor_to_tv
(
indice_pairs
[
1
][
i
,
:
nhot
])
SpconvOps
.
maxpool_forward
(
out_features_tv
,
features_tv
,
out_indices
,
inp_indices
,
stream
)
# torch.cuda.synchronize()
# print("M", time.time() - t)
return
out_features
...
...
spconv/pytorch/pool.py
View file @
62c1496f
...
...
@@ -142,6 +142,20 @@ class SparseMaxPool(SparseModule):
return
out_tensor
class
SparseMaxPool1d
(
SparseMaxPool
):
def
__init__
(
self
,
kernel_size
,
stride
=
None
,
padding
=
0
,
dilation
=
1
,
name
=
None
):
super
(
SparseMaxPool1d
,
self
).
__init__
(
1
,
kernel_size
,
stride
,
padding
,
dilation
,
name
=
name
)
class
SparseMaxPool2d
(
SparseMaxPool
):
def
__init__
(
self
,
kernel_size
,
...
...
@@ -170,3 +184,17 @@ class SparseMaxPool3d(SparseMaxPool):
padding
,
dilation
,
name
=
name
)
class
SparseMaxPool4d
(
SparseMaxPool
):
def
__init__
(
self
,
kernel_size
,
stride
=
None
,
padding
=
0
,
dilation
=
1
,
name
=
None
):
super
(
SparseMaxPool4d
,
self
).
__init__
(
4
,
kernel_size
,
stride
,
padding
,
dilation
,
name
=
name
)
test/aaa.py
View file @
62c1496f
...
...
@@ -13,20 +13,20 @@
# limitations under the License.
STR
=
"""
0.0016176700592041016
0.002481698989868164
0.002
7854442596435547
0.0031723976135253906
0.0017437934875488281
0.002
0503997802734375
0.001399993896484375
0.00161838531494140
62
0.0007357597351074219
0.0008492469787597656
0.0006558895111083984
0.00
07
99
4
17
4957275391
0.000335693359375
0.000347137451171875
BWG 0.0008761882781982422
BWG 0.0008311271667480469
BWG
0.002
079486846923828
BWG 0.002329587936401367
BWG 0.0025458335876464844
BWG
0.002
6700496673583984
BWG 0.002583742141723633
BWG 0.00252628326416015
62
BWG 0.003481149673461914
BWG 0.003238201141357422
BWG 0.005095958709716797
BWG
0.00
378
99
0
17
333984375
BWG 0.003931283950805664
BWG 0.003300189971923828
"""
"""
0.003921985626220703
...
...
@@ -46,37 +46,86 @@ STR = """
0.00030994415283203125
"""
STR
=
"""
0.0006084442138671875
0.0005354881286621094
0.0012688636779785156
0.0012619495391845703
0.002301931381225586
0.0019693374633789062
0.0038712024688720703
0.002872467041015625
0.005068302154541016
0.0047588348388671875
0.007832765579223633
0.005643367767333984
0.005807161331176758
0.004715442657470703"""
STR1
=
"""
SUBM 0.00036716461181640625
G 0.0010955333709716797
G 0.0010745525360107422
REGU 0.0006923675537109375
M 0.0005242824554443359
SUBM 0.0003108978271484375
G 0.0010905265808105469
G 0.0011067390441894531
REGU 0.00058746337890625
M 0.0005304813385009766
SUBM 0.0002682209014892578
G 0.0010945796966552734
G 0.0011165142059326172
REGU 0.0005419254302978516
M 0.0005164146423339844
SUBM 0.00021505355834960938
G 0.0010805130004882812
G 0.0010516643524169922
REGU 0.00052642822265625
M 0.0004677772521972656
SUBM 0.0002262592315673828
G 0.0010986328125
G 0.0010256767272949219
REGU 0.0005693435668945312
M 0.00048661231994628906
SUBM 0.0002319812774658203
G 0.0011110305786132812
G 0.0011196136474609375
REGU 0.0005295276641845703
M 0.0005729198455810547
SUBM 0.00023889541625976562
G 0.0005326271057128906
G 0.0005140304565429688
"""
0.0004992485046386719
0.0003979206085205078
0.0013720989227294922
0.0015933513641357422
0.0027768611907958984
0.0024590492248535156
0.004837512969970703
0.004601001739501953
0.009881019592285156
0.008889913558959961
0.017162084579467773
0.009079217910766602
0.009355545043945312
0.0068836212158203125
STR2
=
"""
SUBM 0.0003352165222167969
G 0.001149892807006836
G 0.0017066001892089844
REGU 0.0006349086761474609
M 0.00048804283142089844
SUBM 0.00029850006103515625
G 0.001767873764038086
G 0.0020656585693359375
REGU 0.0005462169647216797
M 0.0005753040313720703
SUBM 0.0002789497375488281
G 0.0012230873107910156
G 0.0014438629150390625
REGU 0.0005102157592773438
M 0.0005676746368408203
SUBM 0.00020241737365722656
G 0.00102996826171875
G 0.0011174678802490234
REGU 0.0005424022674560547
M 0.0005102157592773438
SUBM 0.0001976490020751953
G 0.0010385513305664062
G 0.0010204315185546875
REGU 0.0005321502685546875
M 0.00047278404235839844
SUBM 0.00021529197692871094
G 0.0010280609130859375
G 0.0010151863098144531
REGU 0.0004942417144775391
M 0.0004811286926269531
SUBM 0.00020694732666015625
G 0.0005142688751220703
G 0.0005171298980712891
"""
def
_handle_lines
(
s
:
str
):
arr
=
s
.
split
(
" "
)
return
(
arr
[
0
],
float
(
arr
[
-
1
]))
from
cumm.gemm.codeops
import
group_by
def
print_str
(
s
:
str
):
nums
=
list
(
map
(
_handle_lines
,
s
.
strip
().
split
(
"
\n
"
)))
num_dict
=
group_by
(
lambda
x
:
x
[
0
],
nums
)
num_dict_
=
{
k
:
sum
([
vv
[
1
]
for
vv
in
v
])
for
k
,
v
in
num_dict
.
items
()}
print
(
num_dict_
)
nums
=
list
(
map
(
float
,
STR
.
strip
().
split
(
"
\n
"
)))
print
(
sum
(
nums
))
\ No newline at end of file
print_str
(
STR1
)
print_str
(
STR2
)
\ No newline at end of file
test/benchmark.py
View file @
62c1496f
...
...
@@ -224,25 +224,26 @@ def main():
# voxels, coors, spatial_shape = waymo_data()
# with open("/home/yy/test_spconv.pkl", "wb") as f:
# pickle.dump((voxels, coors, spatial_shape), f)
with
open
(
"/home/yy/
test_spconv.pkl"
,
"rb"
)
as
f
:
with
open
(
Path
(
__file__
).
parent
/
"data"
/
"
test_spconv.pkl"
,
"rb"
)
as
f
:
(
voxels
,
coors
,
spatial_shape
)
=
pickle
.
load
(
f
)
print
(
spatial_shape
)
print
(
voxels
.
shape
)
# voxels = voxels[:100]
# coors = coors[:100]
voxels_th
=
torch
.
from_numpy
(
voxels
).
cuda
().
float
()
dtype
=
torch
.
float32
voxels_th
=
torch
.
from_numpy
(
voxels
).
cuda
().
to
(
dtype
)
coors_th
=
torch
.
from_numpy
(
coors
).
cuda
().
int
()
voxels_th
.
requires_grad
=
True
algo
=
spconv
.
ConvAlgo
.
Native
net
=
Net
(
spatial_shape
,
algo
).
cuda
().
eval
().
float
(
)
net
=
Net
(
spatial_shape
,
algo
).
cuda
().
eval
().
to
(
dtype
)
print
(
coors_th
.
shape
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
print
(
out
.
spatial_shape
)
print
(
voxels
.
mean
(),
voxels
.
max
(),
voxels
.
min
())
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out
.
features
.
shape
).
astype
(
np
.
float32
)
dout_t
=
torch
.
from_numpy
(
dout
).
cuda
()
dout_t
=
torch
.
from_numpy
(
dout
).
cuda
()
.
to
(
dtype
)
print
(
out
.
spatial_shape
,
out
.
features
.
mean
(),
out
.
features
.
max
(),
out
.
features
.
min
())
times
=
[]
...
...
test/data/test_spconv.pkl
0 → 100644
View file @
62c1496f
File added
test/test_conv.py
View file @
62c1496f
...
...
@@ -381,17 +381,17 @@ class TestSpConv(TestCase):
else
:
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
dtype
=
torch
.
float16
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
.
to
(
dtype
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
.
to
(
dtype
)
features_dense_t
.
requires_grad
=
True
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
d
).
to
(
device
)
.
to
(
dtype
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
d
).
to
(
device
)
.
to
(
dtype
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
.
to
(
dtype
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
...
...
@@ -442,6 +442,11 @@ class TestSpConv(TestCase):
strides
=
[
2
,
3
]
paddings
=
[
0
,
1
,
2
]
dilations
=
[
1
,
2
,
3
]
ksizes
=
[
3
]
strides
=
[
1
]
paddings
=
[
0
]
dilations
=
[
1
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
...
...
@@ -458,8 +463,13 @@ class TestSpConv(TestCase):
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
if
FILTER_HWIO
:
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
else
:
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
OC
,
IC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
...
...
@@ -470,11 +480,20 @@ class TestSpConv(TestCase):
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
print
(
net_ref
.
net
[
0
].
weight
.
shape
)
if
FILTER_HWIO
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
else
:
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
...
...
@@ -490,12 +509,12 @@ class TestSpConv(TestCase):
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
if
FILTER_HWIO
:
dw
=
dw
.
transpose
(
3
,
4
,
0
,
1
,
2
)
else
:
dw
=
dw
.
transpose
(
4
,
3
,
0
,
1
,
2
)
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpCpConv3d
(
self
):
np
.
random
.
seed
(
484
)
...
...
tools/build-wheels.sh
View file @
62c1496f
...
...
@@ -27,8 +27,8 @@ function repair_wheel {
export
SPCONV_DISABLE_JIT
=
"1"
export
CUMM_CUDA_ARCH_LIST
=
"all"
# Compile wheels, we only support 3.
7
-3.10.
#
"/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
# Compile wheels, we only support 3.
6
-3.10.
"/opt/python/cp36-cp36m/bin/pip"
wheel /io/
--no-deps
-w
/io/wheelhouse_tmp
"/opt/python/cp37-cp37m/bin/pip"
wheel /io/
--no-deps
-w
/io/wheelhouse_tmp
"/opt/python/cp38-cp38/bin/pip"
wheel /io/
--no-deps
-w
/io/wheelhouse_tmp
"/opt/python/cp39-cp39/bin/pip"
wheel /io/
--no-deps
-w
/io/wheelhouse_tmp
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment