Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
6c767a51
"csrc/sm90/vscode:/vscode.git/clone" did not exist on "41b611f7d7561790a2f5040ff89212e08c7b0011"
Commit
6c767a51
authored
May 21, 2020
by
Yan Yan
Browse files
working on remove functor
parent
19e73bbe
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
584 additions
and
413 deletions
+584
-413
include/spconv/fused_spconv_ops.h
include/spconv/fused_spconv_ops.h
+6
-30
include/spconv/indice.h
include/spconv/indice.h
+15
-0
include/spconv/reordering.h
include/spconv/reordering.h
+10
-15
include/spconv/spconv_ops.h
include/spconv/spconv_ops.h
+9
-1
include/tensorview/tensor.h
include/tensorview/tensor.h
+34
-6
spconv/ops.py
spconv/ops.py
+2
-11
src/spconv/all.cc
src/spconv/all.cc
+4
-5
src/spconv/indice.cc
src/spconv/indice.cc
+74
-0
src/spconv/indice.cu
src/spconv/indice.cu
+61
-43
src/spconv/reordering.cc
src/spconv/reordering.cc
+49
-48
src/spconv/reordering.cu
src/spconv/reordering.cu
+131
-120
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+183
-129
test/test_conv.py
test/test_conv.py
+6
-5
No files found.
include/spconv/fused_spconv_ops.h
View file @
6c767a51
...
@@ -24,7 +24,6 @@
...
@@ -24,7 +24,6 @@
namespace
spconv
{
namespace
spconv
{
// torch.jit's doc says only support int64, so we need to convert to int32.
// torch.jit's doc says only support int64, so we need to convert to int32.
template
<
typename
T
>
torch
::
Tensor
torch
::
Tensor
fusedIndiceConvBatchNorm
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
fusedIndiceConvBatchNorm
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
bias
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
bias
,
torch
::
Tensor
indicePairs
,
...
@@ -80,31 +79,17 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
...
@@ -80,31 +79,17 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
continue
;
continue
;
}
}
// auto timer = spconv::CudaContextTimer<>();
// auto timer = spconv::CudaContextTimer<>();
auto
outputBufferBlob
=
torch
::
from_blob
(
outputBuffer
.
data_ptr
<
T
>
(),
auto
outputBufferBlob
=
torch
::
from_blob
(
outputBuffer
.
data_ptr
(),
{
nHot
,
numOutPlanes
},
options
);
{
nHot
,
numOutPlanes
},
options
);
auto
inputBufferBlob
=
torch
::
from_blob
(
inputBuffer
.
data_ptr
<
T
>
(),
auto
inputBufferBlob
=
torch
::
from_blob
(
inputBuffer
.
data_ptr
(),
{
nHot
,
numInPlanes
},
options
);
{
nHot
,
numInPlanes
},
options
);
if
(
device
==
torch
::
kCPU
)
{
if
(
device
==
torch
::
kCPU
)
{
functor
::
SparseGatherFunctor
<
tv
::
CPU
,
T
,
int
>
gatherFtor
;
sparse_gather_cpu
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
gatherFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
T
>
(
features
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
gatherFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
T
>
(
features
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
TV_CHECK_CUDA_ERR
();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
}
#endif
#endif
else
{
else
{
...
@@ -116,20 +101,11 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
...
@@ -116,20 +101,11 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
// totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0;
if
(
device
==
torch
::
kCPU
)
{
if
(
device
==
torch
::
kCPU
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
CPU
,
T
,
int
>
scatterFtor
;
sparse_scatter_add_cpu
(
outputBuffer
,
output
,
indicePairs
[
i
][
!
inverse
],
nHot
);
scatterFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
T
>
(
output
),
tv
::
torch2tv
<
const
T
>
(
outputBuffer
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
,
true
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
sparse_scatter_add_cuda
(
outputBuffer
,
output
,
indicePairs
[
i
][
!
inverse
],
nHot
);
scatterFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
output
),
tv
::
torch2tv
<
const
T
>
(
outputBuffer
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
,
true
);
TV_CHECK_CUDA_ERR
();
}
}
#endif
#endif
else
{
else
{
...
...
include/spconv/indice.h
View file @
6c767a51
...
@@ -97,6 +97,21 @@ int create_submconv_indice_pair_cuda(
...
@@ -97,6 +97,21 @@ int create_submconv_indice_pair_cuda(
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
int
create_conv_indice_pair_cpu
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
int
create_submconv_indice_pair_cpu
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
}
// namespace spconv
}
// namespace spconv
#endif
#endif
\ No newline at end of file
include/spconv/reordering.h
View file @
6c767a51
...
@@ -15,24 +15,19 @@
...
@@ -15,24 +15,19 @@
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace
spconv
{
namespace
spconv
{
namespace
functor
{
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
template
<
typename
Device
,
typename
T
,
typename
Index
>
torch
::
Tensor
indices
,
int
size
);
struct
SparseGatherFunctor
{
void
sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
void
operator
()(
const
Device
&
d
,
tv
::
TensorView
<
T
>
buffer
,
torch
::
Tensor
indices
,
int
size
);
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
);
void
sparse_gather_cpu
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
};
torch
::
Tensor
indices
,
int
size
);
void
sparse_scatter_add_cpu
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
);
template
<
typename
Device
,
typename
T
,
typename
Index
>
struct
SparseScatterAddFunctor
{
void
operator
()(
const
Device
&
d
,
tv
::
TensorView
<
T
>
out_features
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
=
false
);
};
}
// namespace functor
}
// namespace spconv
}
// namespace spconv
#endif
#endif
\ No newline at end of file
include/spconv/spconv_ops.h
View file @
6c767a51
...
@@ -198,6 +198,15 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
...
@@ -198,6 +198,15 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
}
}
}
}
std
::
vector
<
torch
::
Tensor
>
getIndicePairV2
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outPadding
,
int64_t
_subM
,
int64_t
_transpose
,
int64_t
_useHash
);
template
<
unsigned
NDim
>
template
<
unsigned
NDim
>
std
::
vector
<
torch
::
Tensor
>
getIndicePairPreGrid
(
std
::
vector
<
torch
::
Tensor
>
getIndicePairPreGrid
(
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
...
@@ -333,7 +342,6 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
...
@@ -333,7 +342,6 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
);
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
);
std
::
vector
<
torch
::
Tensor
>
std
::
vector
<
torch
::
Tensor
>
indiceConvBackward
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
indiceConvBackward
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
...
...
include/tensorview/tensor.h
View file @
6c767a51
...
@@ -52,6 +52,10 @@ enum DType {
...
@@ -52,6 +52,10 @@ enum DType {
namespace
detail
{
namespace
detail
{
using
dtype_collection_t
=
tv
::
mp_list_c
<
int
,
float32
,
int32
,
int16
,
int8
,
float64
,
bool_
,
uint8
,
float16
,
int64
,
uint16
,
uint32
,
uint64
>
;
using
all_tensor_types_t
=
using
all_tensor_types_t
=
std
::
tuple
<
float
,
double
,
int8_t
,
int16_t
,
int32_t
,
int64_t
,
uint8_t
,
std
::
tuple
<
float
,
double
,
int8_t
,
int16_t
,
int32_t
,
int64_t
,
uint8_t
,
uint16_t
,
uint32_t
,
uint64_t
,
bool
>
;
uint16_t
,
uint32_t
,
uint64_t
,
bool
>
;
...
@@ -305,7 +309,7 @@ template <class... Ts, typename F> void dispatch(DType t, F &&f) {
...
@@ -305,7 +309,7 @@ template <class... Ts, typename F> void dispatch(DType t, F &&f) {
static_assert
(
sizeof
...(
Ts
)
>
0
,
"you need to provide at least one type"
);
static_assert
(
sizeof
...(
Ts
)
>
0
,
"you need to provide at least one type"
);
bool
notFound
=
true
;
bool
notFound
=
true
;
mp_for_each
<
mp_list
<
Ts
...
>>
([
=
,
&
notFound
,
&
f
](
auto
I
)
{
mp_for_each
<
mp_list
<
Ts
...
>>
([
=
,
&
notFound
,
&
f
](
auto
I
)
{
if
(
type_v
<
decltype
(
I
)
>
==
t
)
{
if
(
type_v
<
decltype
(
I
)
>
==
t
&&
notFound
)
{
std
::
forward
<
F
>
(
f
)(
decltype
(
I
)());
std
::
forward
<
F
>
(
f
)(
decltype
(
I
)());
notFound
=
false
;
notFound
=
false
;
}
}
...
@@ -325,7 +329,7 @@ template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
...
@@ -325,7 +329,7 @@ template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
"you need to provide at least one candidate"
);
"you need to provide at least one candidate"
);
bool
notFound
=
true
;
bool
notFound
=
true
;
mp_for_each
<
mp_list_c
<
T
,
Is
...
>>
([
=
,
&
notFound
,
&
f
](
auto
I
)
{
mp_for_each
<
mp_list_c
<
T
,
Is
...
>>
([
=
,
&
notFound
,
&
f
](
auto
I
)
{
if
(
T
(
I
)
==
idx
)
{
if
(
T
(
I
)
==
idx
&&
notFound
)
{
std
::
forward
<
F
>
(
f
)(
I
);
std
::
forward
<
F
>
(
f
)(
I
);
notFound
=
false
;
notFound
=
false
;
}
}
...
@@ -343,7 +347,27 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
...
@@ -343,7 +347,27 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
"you need to provide at least one candidate"
);
"you need to provide at least one candidate"
);
bool
notFound
=
true
;
bool
notFound
=
true
;
mp_for_each
<
mp_list_c
<
int
,
Is
...
>>
([
=
,
&
notFound
,
&
f
](
auto
I
)
{
mp_for_each
<
mp_list_c
<
int
,
Is
...
>>
([
=
,
&
notFound
,
&
f
](
auto
I
)
{
if
(
int
(
I
)
==
idx
)
{
if
(
decltype
(
I
)
::
value
==
idx
&&
notFound
)
{
std
::
forward
<
F
>
(
f
)(
I
);
notFound
=
false
;
}
});
if
(
notFound
)
{
std
::
stringstream
ss
;
mp_for_each
<
mp_list_c
<
int
,
Is
...
>>
(
[
=
,
&
ss
](
auto
I
)
{
ss
<<
decltype
(
I
)
::
value
<<
" "
;
});
TV_THROW_RT_ERR
(
"unknown value"
,
idx
,
", available:"
,
ss
.
str
());
}
}
template
<
int
...
Is
,
typename
F
,
class
BinaryPredicate
>
void
dispatch_int
(
int
idx
,
BinaryPredicate
p
,
F
&&
f
)
{
// BinaryPredicate: BinaryPredicate(idx, candidate)
static_assert
(
sizeof
...(
Is
)
>
0
,
"you need to provide at least one candidate"
);
bool
notFound
=
true
;
mp_for_each
<
mp_list_c
<
int
,
Is
...
>>
([
=
,
&
notFound
,
&
f
](
auto
I
)
{
if
(
p
(
idx
,
decltype
(
I
)
::
value
)
&&
notFound
)
{
std
::
forward
<
F
>
(
f
)(
I
);
std
::
forward
<
F
>
(
f
)(
I
);
notFound
=
false
;
notFound
=
false
;
}
}
...
@@ -351,7 +375,7 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
...
@@ -351,7 +375,7 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
if
(
notFound
)
{
if
(
notFound
)
{
std
::
stringstream
ss
;
std
::
stringstream
ss
;
mp_for_each
<
mp_list_c
<
int
,
Is
...
>>
(
mp_for_each
<
mp_list_c
<
int
,
Is
...
>>
(
[
=
,
&
ss
](
auto
I
)
{
ss
<<
int
(
I
)
<<
" "
;
});
[
=
,
&
ss
](
auto
I
)
{
ss
<<
decltype
(
I
)
::
value
<<
" "
;
});
TV_THROW_RT_ERR
(
"unknown value"
,
idx
,
", available:"
,
ss
.
str
());
TV_THROW_RT_ERR
(
"unknown value"
,
idx
,
", available:"
,
ss
.
str
());
}
}
}
}
...
@@ -373,12 +397,16 @@ struct Dispatch<T<Args...>> {
...
@@ -373,12 +397,16 @@ struct Dispatch<T<Args...>> {
template
<
class
T
>
struct
DispatchInt
;
template
<
class
T
>
struct
DispatchInt
;
template
<
template
<
int
...
>
class
T
,
int
...
Ints
>
template
<
template
<
class
...
>
class
Tin
,
template
<
class
,
int
>
class
T
,
int
...
Ints
>
struct
DispatchInt
<
T
<
Ints
...
>>
{
struct
DispatchInt
<
T
in
<
T
<
int
,
Ints
>
...
>>
{
template
<
typename
F
>
inline
void
operator
()(
int
t
,
F
&&
f
)
{
template
<
typename
F
>
inline
void
operator
()(
int
t
,
F
&&
f
)
{
return
dispatch_int
<
Ints
...
>
(
t
,
std
::
forward
<
F
>
(
f
));
return
dispatch_int
<
Ints
...
>
(
t
,
std
::
forward
<
F
>
(
f
));
}
}
template
<
typename
F
,
typename
BinaryPredicate
>
inline
void
operator
()(
int
t
,
BinaryPredicate
p
,
F
&&
f
)
{
return
dispatch_int
<
Ints
...
>
(
t
,
p
,
std
::
forward
<
F
>
(
f
));
}
};
};
constexpr
size_t
kTensorMaxDim
=
10
;
constexpr
size_t
kTensorMaxDim
=
10
;
using
TensorShape
=
ShapeBase
<
kTensorMaxDim
,
int64_t
>
;
using
TensorShape
=
ShapeBase
<
kTensorMaxDim
,
int64_t
>
;
...
...
spconv/ops.py
View file @
6c767a51
...
@@ -81,16 +81,7 @@ def get_indice_pairs(indices,
...
@@ -81,16 +81,7 @@ def get_indice_pairs(indices,
else
:
else
:
out_shape
=
spatial_shape
out_shape
=
spatial_shape
if
grid
is
None
:
if
grid
is
None
:
if
ndim
==
2
:
res
=
torch
.
ops
.
spconv
.
get_indice_pairs_v2
(
indices
,
batch_size
,
out_shape
,
get_indice_pairs_func
=
torch
.
ops
.
spconv
.
get_indice_pairs_2d
elif
ndim
==
3
:
get_indice_pairs_func
=
torch
.
ops
.
spconv
.
get_indice_pairs_3d
elif
ndim
==
4
:
get_indice_pairs_func
=
torch
.
ops
.
spconv
.
get_indice_pairs_4d
else
:
raise
NotImplementedError
res
=
get_indice_pairs_func
(
indices
,
batch_size
,
out_shape
,
spatial_shape
,
ksize
,
stride
,
padding
,
spatial_shape
,
ksize
,
stride
,
padding
,
dilation
,
out_padding
,
int
(
subm
),
dilation
,
out_padding
,
int
(
subm
),
int
(
transpose
),
int
(
use_hash
))
int
(
transpose
),
int
(
use_hash
))
...
@@ -115,7 +106,7 @@ def indice_conv(features,
...
@@ -115,7 +106,7 @@ def indice_conv(features,
num_activate_out
,
num_activate_out
,
inverse
=
False
,
inverse
=
False
,
subm
=
False
):
subm
=
False
):
return
torch
.
ops
.
spconv
.
indice_conv
(
features
,
filters
,
indice_pairs
,
return
torch
.
ops
.
spconv
.
indice_conv
_v2
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
indice_pair_num
,
num_activate_out
,
int
(
inverse
),
int
(
subm
))
int
(
inverse
),
int
(
subm
))
...
...
src/spconv/all.cc
View file @
6c767a51
...
@@ -12,28 +12,27 @@
...
@@ -12,28 +12,27 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
#include <torch/script.h>
#include <spconv/fused_spconv_ops.h>
static
auto
registry
=
static
auto
registry
=
torch
::
RegisterOperators
()
torch
::
RegisterOperators
()
.
op
(
"spconv::get_indice_pairs_2d"
,
&
spconv
::
getIndicePair
<
2
>
)
.
op
(
"spconv::get_indice_pairs_2d"
,
&
spconv
::
getIndicePair
<
2
>
)
.
op
(
"spconv::get_indice_pairs_3d"
,
&
spconv
::
getIndicePair
<
3
>
)
.
op
(
"spconv::get_indice_pairs_3d"
,
&
spconv
::
getIndicePair
<
3
>
)
.
op
(
"spconv::get_indice_pairs_4d"
,
&
spconv
::
getIndicePair
<
4
>
)
.
op
(
"spconv::get_indice_pairs_4d"
,
&
spconv
::
getIndicePair
<
4
>
)
.
op
(
"spconv::get_indice_pairs_v2"
,
&
spconv
::
getIndicePairV2
)
.
op
(
"spconv::get_indice_pairs_grid_2d"
,
.
op
(
"spconv::get_indice_pairs_grid_2d"
,
&
spconv
::
getIndicePairPreGrid
<
2
>
)
&
spconv
::
getIndicePairPreGrid
<
2
>
)
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
&
spconv
::
getIndicePairPreGrid
<
3
>
)
&
spconv
::
getIndicePairPreGrid
<
3
>
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::fused_indice_conv_fp32"
,
.
op
(
"spconv::fused_indice_conv_bn"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
float
>
)
&
spconv
::
fusedIndiceConvBatchNorm
)
.
op
(
"spconv::fused_indice_conv_half"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
at
::
Half
>
)
.
op
(
"spconv::indice_maxpool_fp32"
,
&
spconv
::
indiceMaxPool
<
float
>
)
.
op
(
"spconv::indice_maxpool_fp32"
,
&
spconv
::
indiceMaxPool
<
float
>
)
.
op
(
"spconv::indice_maxpool_backward_fp32"
,
.
op
(
"spconv::indice_maxpool_backward_fp32"
,
&
spconv
::
indiceMaxPoolBackward
<
float
>
)
&
spconv
::
indiceMaxPoolBackward
<
float
>
)
...
...
src/spconv/indice.cc
View file @
6c767a51
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include <spconv/geometry.h>
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <spconv/spconv_ops.h>
#include <tensorview/tensor.h>
#include <torch/script.h>
#include <torch/script.h>
namespace
spconv
{
namespace
spconv
{
...
@@ -253,6 +254,79 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -253,6 +254,79 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
}
}
#endif
#endif
int
create_conv_indice_pair_cpu
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
auto
ndim
=
outSpatialShape
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
int
batchSize
=
gridsOut
.
size
(
0
);
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
if
(
transpose
)
numActIn
=
getIndicePairsDeConv
<
Index
,
IndexGrid
,
NDim
>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicesOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
.
data
(),
st
.
data
(),
pa
.
data
(),
di
.
data
(),
ou
.
data
());
else
numActIn
=
getIndicePairsConv
<
Index
,
IndexGrid
,
NDim
>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicesOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
.
data
(),
st
.
data
(),
pa
.
data
(),
di
.
data
(),
ou
.
data
());
});
});
return
numActIn
;
}
int
create_submconv_indice_pair_cpu
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
auto
ndim
=
outSpatialShape
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
int
batchSize
=
gridsOut
.
size
(
0
);
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
numActIn
=
getIndicePairsSubM
<
Index
,
IndexGrid
,
NDim
>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
.
data
(),
st
.
data
(),
pa
.
data
(),
di
.
data
(),
ou
.
data
());
});
});
return
numActIn
;
}
namespace
functor
{
namespace
functor
{
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
struct
CreateConvIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
struct
CreateConvIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
...
...
src/spconv/indice.cu
View file @
6c767a51
...
@@ -38,39 +38,45 @@ int create_conv_indice_pair_p1_cuda(
...
@@ -38,39 +38,45 @@ int create_conv_indice_pair_p1_cuda(
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
ndim
=
kernelSize
.
size
();
auto
ndim
=
kernelSize
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
auto
numActIn
=
indicesIn
.
size
(
0
);
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
if
(
numActIn
==
0
)
return
0
;
return
0
;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
IndexValue
);
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
tv
::
dispatch_int
<
16
,
32
,
256
,
4096
>
(
kernelVolume
,
std
::
less_equal
<
int
>
(),
[
&
](
auto
I2
)
{
constexpr
int
MaxKernelVolume
=
decltype
(
I2
)
::
value
;
if
(
transpose
)
{
if
(
transpose
)
{
prepareDeConvIndicePairsKernel
<
Index
,
NDim
,
4096
>
prepareDeConvIndicePairsKernel
<
Index
,
NDim
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
ou
);
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareDeConvIndicePairsKernel failed"
);
}
else
{
}
else
{
prepareIndicePairsKernel
<
Index
,
NDim
,
4096
>
prepareIndicePairsKernel
<
Index
,
NDim
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
ou
);
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareIndicePairsKernel failed"
);
}
}
});
});
});
});
});
return
1
;
return
1
;
}
}
...
@@ -88,12 +94,11 @@ int create_conv_indice_pair_p2_cuda(
...
@@ -88,12 +94,11 @@ int create_conv_indice_pair_p2_cuda(
auto
kernelVolume
=
indicePairs
.
size
(
0
);
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
if
(
numActIn
==
0
)
return
0
;
return
0
;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
IndexValue
);
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
using
IndexGrid
=
int32_t
;
using
IndexGrid
=
int32_t
;
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
...
@@ -122,6 +127,8 @@ int create_conv_indice_pair_p2_cuda(
...
@@ -122,6 +127,8 @@ int create_conv_indice_pair_p2_cuda(
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
numAct
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
numAct
,
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
,
batchSize
);
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
,
batchSize
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndiceOutKernel failed"
);
auto
tableSize
=
table
.
get_table_size
();
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
tableData
=
table
.
data
();
auto
constants
=
table
.
get_constants_4
();
auto
constants
=
table
.
get_constants_4
();
...
@@ -133,6 +140,7 @@ int create_conv_indice_pair_p2_cuda(
...
@@ -133,6 +140,7 @@ int create_conv_indice_pair_p2_cuda(
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
tableSize
,
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
tableData
,
constants
,
stash_constants
,
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsHashKernel failed"
);
}
else
{
}
else
{
assignGridAndIndiceOutKernel
<
Index
,
IndexGrid
,
NDim
>
assignGridAndIndiceOutKernel
<
Index
,
IndexGrid
,
NDim
>
...
@@ -145,7 +153,7 @@ int create_conv_indice_pair_p2_cuda(
...
@@ -145,7 +153,7 @@ int create_conv_indice_pair_p2_cuda(
assignIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
>
assignIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
,
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
In
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
);
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
...
@@ -177,11 +185,11 @@ int create_submconv_indice_pair_cuda(
...
@@ -177,11 +185,11 @@ int create_submconv_indice_pair_cuda(
auto
kernelVolume
=
indicePairs
.
size
(
0
);
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
if
(
numActIn
==
0
)
return
0
;
return
0
;
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
using
Index
=
decltype
(
V
);
using
Index
=
decltype
(
IndexValue
);
using
IndexGrid
=
int32_t
;
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
...
@@ -214,26 +222,36 @@ int create_submconv_indice_pair_cuda(
...
@@ -214,26 +222,36 @@ int create_submconv_indice_pair_cuda(
auto
constants
=
table
.
get_constants_4
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
auto
stash_count
=
table
.
get_stash_count
();
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
4096
>
tv
::
dispatch_int
<
16
,
32
,
256
,
4096
>
(
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
kernelVolume
,
std
::
less_equal
<
int
>
(),
[
&
](
auto
I2
)
{
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
constexpr
int
MaxKernelVolume
=
decltype
(
I2
)
::
value
;
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
,
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
tableSize
,
tableData
,
constants
,
stash_constants
,
di
,
ou
,
tableSize
,
tableData
,
constants
,
stash_count
);
stash_constants
,
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"getSubMIndicePairsHashKernel failed"
);
});
}
else
{
}
else
{
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
);
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMGridKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMGridKernel failed"
);
getSubMIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
tv
::
dispatch_int
<
16
,
32
,
256
,
4096
>
(
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
ndim
,
std
::
less_equal
<
int
>
(),
[
&
](
auto
I2
)
{
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
constexpr
int
MaxKernelVolume
=
decltype
(
I2
)
::
value
;
getSubMIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
);
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
});
}
}
if
(
resetGrid
&&
(
!
useHash
))
{
if
(
resetGrid
&&
(
!
useHash
))
{
...
...
src/spconv/reordering.cc
View file @
6c767a51
...
@@ -14,59 +14,60 @@
...
@@ -14,59 +14,60 @@
#include <ATen/Parallel.h>
#include <ATen/Parallel.h>
#include <spconv/reordering.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <torch/script.h>
namespace
spconv
{
namespace
spconv
{
namespace
functor
{
using
float_types_t
=
tv
::
mp_list
<
float
,
double
,
at
::
Half
>
;
template
<
typename
T
,
typename
Index
>
using
int_types_t
=
tv
::
mp_list
<
int32_t
,
int64_t
>
;
struct
SparseGatherFunctor
<
tv
::
CPU
,
T
,
Index
>
{
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
void
sparse_gather_cpu
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
tv
::
TensorView
<
const
T
>
features
,
torch
::
Tensor
indices
,
int
size
)
{
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
int
numPlanes
=
features
.
size
(
1
);
int
numPlanes
=
features
.
dim
(
1
);
auto
dtype
=
features
.
scalar_type
();
auto
int_dtype
=
indices
.
scalar_type
();
tv
::
DispatchTorch
<
float_types_t
>
()(
dtype
,
[
&
](
auto
TValue
)
{
using
T
=
decltype
(
TValue
);
tv
::
DispatchTorch
<
int_types_t
>
()(
int_dtype
,
[
&
](
auto
IndexValue
)
{
using
Index
=
decltype
(
IndexValue
);
Index
*
indices_data
=
indices
.
data_ptr
<
Index
>
();
T
*
buffer_data
=
buffer
.
data_ptr
<
T
>
();
const
T
*
features_data
=
features
.
data_ptr
<
T
>
();
at
::
parallel_for
(
0
,
size
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
)
{
at
::
parallel_for
(
0
,
size
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
)
{
for
(
int
i
=
begin
;
i
<
end
;
++
i
)
{
for
(
int
i
=
begin
;
i
<
end
;
++
i
)
{
std
::
memcpy
(
buffer
.
data
()
+
i
*
numPlanes
,
std
::
memcpy
(
buffer
_
data
+
i
*
numPlanes
,
features
.
data
()
+
indices
[
i
]
*
numPlanes
,
features
_
data
+
indices
_data
[
i
]
*
numPlanes
,
sizeof
(
T
)
*
numPlanes
);
sizeof
(
T
)
*
numPlanes
);
}
}
});
});
}
});
};
});
}
template
<
typename
T
,
typename
Index
>
void
sparse_scatter_add_cpu
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
struct
SparseScatterAddFunctor
<
tv
::
CPU
,
T
,
Index
>
{
torch
::
Tensor
indices
,
int
size
)
{
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
int
numPlanes
=
outFeatures
.
size
(
1
);
tv
::
TensorView
<
const
T
>
buffer
,
auto
dtype
=
outFeatures
.
scalar_type
();
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
auto
int_dtype
=
indices
.
scalar_type
();
int
numPlanes
=
outFeatures
.
dim
(
1
);
const
T
*
buf
=
buffer
.
data
();
tv
::
DispatchTorch
<
float_types_t
>
()(
dtype
,
[
&
](
auto
TValue
)
{
T
*
out
=
outFeatures
.
data
();
using
T
=
decltype
(
TValue
);
tv
::
DispatchTorch
<
int_types_t
>
()(
int_dtype
,
[
&
](
auto
IndexValue
)
{
using
Index
=
decltype
(
IndexValue
);
Index
*
indices_data
=
indices
.
data_ptr
<
Index
>
();
const
T
*
buffer_data
=
buffer
.
data_ptr
<
T
>
();
T
*
features_data
=
outFeatures
.
data_ptr
<
T
>
();
const
T
*
buf
=
buffer
.
data_ptr
<
T
>
();
T
*
out
=
outFeatures
.
data_ptr
<
T
>
();
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
buf
=
buffer
.
data
()
+
i
*
numPlanes
;
buf
=
buffer
_
data
+
i
*
numPlanes
;
out
=
outF
eatures
.
data
()
+
indices
[
i
]
*
numPlanes
;
out
=
f
eatures
_
data
+
indices
_data
[
i
]
*
numPlanes
;
for
(
int
j
=
0
;
j
<
numPlanes
;
++
j
)
{
for
(
int
j
=
0
;
j
<
numPlanes
;
++
j
)
{
out
[
j
]
+=
buf
[
j
];
out
[
j
]
+=
buf
[
j
];
}
}
}
}
}
});
};
});
}
}
// namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS
(
float
);
DECLARE_CPU_SPECS
(
double
);
DECLARE_CPU_SPECS
(
at
::
Half
);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
}
// namespace spconv
}
// namespace spconv
src/spconv/reordering.cu
View file @
6c767a51
...
@@ -20,23 +20,31 @@
...
@@ -20,23 +20,31 @@
#include <tensorview/cuda_utils.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
#include <type_traits>
#include <type_traits>
#include <utility/timer.h>
#include <utility/timer.h>
namespace
spconv
{
namespace
spconv
{
namespace
functor
{
template
<
typename
T
,
typename
Index
>
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
struct
SparseGatherFunctor
<
tv
::
GPU
,
T
,
Index
>
{
torch
::
Tensor
indices
,
int
size
)
{
if
(
size
<=
0
)
return
;
int
numPlanes
=
features
.
size
(
1
);
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
tv
::
dispatch_torch
<
float
,
double
,
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
TValue
)
{
using
T
=
decltype
(
TValue
);
using
vecload_type_t
=
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
indices
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
if
(
size
<=
0
)
using
Index
=
decltype
(
IndexValue
);
return
;
int
numPlanes
=
features
.
dim
(
1
);
bool
notFound
=
true
;
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
buffer
,
&
features
,
&
indices
,
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
buffer
,
&
features
,
&
indices
,
...
@@ -47,11 +55,12 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
...
@@ -47,11 +55,12 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
if
(
notFound
)
{
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
nHotBlock
>=
NumTLP
)
{
if
(
nHotBlock
>=
NumTLP
)
{
gatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
gatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getS
tream
()
>>>
(
buffer
.
data
(),
features
.
data
(),
s
tream
>>>
(
buffer
.
data
_ptr
<
T
>
(),
features
.
data
_ptr
<
T
>
(),
indices
.
data
(),
nHotBlock
,
indices
.
data
_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
...
@@ -60,10 +69,10 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
...
@@ -60,10 +69,10 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
gatherVecKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
d
.
getS
tream
()
>>>
(
buffer
.
data
()
+
nHotBlock
*
numPlanes
,
s
tream
>>>
(
buffer
.
data
_ptr
<
T
>
()
+
nHotBlock
*
numPlanes
,
features
.
data
(),
indices
.
data
()
+
nHotBlock
,
features
.
data
_ptr
<
T
>
()
,
size
-
nHotBlock
,
indices
.
data_ptr
<
Index
>
()
+
nHotBlock
,
numPlanes
/
vecloadFactor
);
size
-
nHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
notFound
=
false
;
notFound
=
false
;
...
@@ -77,49 +86,63 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
...
@@ -77,49 +86,63 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
gatherGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
stream
>>>
(
buffer
.
data
(),
features
.
data
(),
indices
.
data
(),
size
,
numPlanes
);
buffer
.
data_ptr
<
T
>
(),
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
size
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
}
});
};
});
template
<
typename
T
,
typename
Index
>
}
struct
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
Index
>
{
void
sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
)
{
if
(
size
<=
0
)
return
;
int
numPlanes
=
outFeatures
.
size
(
1
);
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
tv
::
dispatch_torch
<
float
,
double
,
at
::
Half
>
(
outFeatures
.
scalar_type
(),
[
&
](
auto
TValue
)
{
using
T
=
decltype
(
TValue
);
using
vecload_type_t
=
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
indices
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
if
(
size
<=
0
)
using
Index
=
decltype
(
IndexValue
);
return
;
int
numPlanes
=
outFeatures
.
dim
(
1
);
bool
notFound
=
true
;
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
d
,
&
outFeatures
,
&
buffer
,
&
indices
,
tv
::
mp_for_each
<
kernel_block_t
>
(
&
notFound
](
auto
NumTLP
)
{
[
=
,
&
outFeatures
,
&
buffer
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
notFound
)
{
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
nHotBlock
>=
NumTLP
)
{
if
(
nHotBlock
>=
NumTLP
)
{
scatterAddVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
scatterAddVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
vecload_type_t
>
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
buffer
.
data
(),
0
,
stream
>>>
(
outFeatures
.
data_ptr
<
T
>
(),
indices
.
data
(),
nHotBlock
,
buffer
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
if
(
size
-
nHotBlock
>
0
)
{
if
(
size
-
nHotBlock
>
0
)
{
scatterAddGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
scatterAddGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
stream
>>>
(
outFeatures
.
data
(),
buffer
.
data
()
+
nHotBlock
*
numPlanes
,
outFeatures
.
data_ptr
<
T
>
(),
indices
.
data
()
+
nHotBlock
,
size
-
nHotBlock
,
numPlanes
);
buffer
.
data_ptr
<
T
>
()
+
nHotBlock
*
numPlanes
,
indices
.
data_ptr
<
Index
>
()
+
nHotBlock
,
size
-
nHotBlock
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
notFound
=
false
;
notFound
=
false
;
...
@@ -132,25 +155,13 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
...
@@ -132,25 +155,13 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
scatterAddGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
scatterAddGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getS
tream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
s
tream
>>>
(
outFeatures
.
data
(),
buffer
.
data
(),
indices
.
data
(),
size
,
outFeatures
.
data
_ptr
<
T
>
(),
buffer
.
data
_ptr
<
T
>
()
,
numPlanes
);
indices
.
data_ptr
<
Index
>
(),
size
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
}
});
};
});
}
// namespace functor
}
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS
(
float
);
DECLARE_GPU_SPECS
(
double
);
DECLARE_GPU_SPECS
(
at
::
Half
);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
}
// namespace spconv
}
// namespace spconv
\ No newline at end of file
src/spconv/spconv_ops.cc
View file @
6c767a51
#include <spconv/spconv_ops.h>
#include <spconv/spconv_ops.h>
namespace
spconv
{
namespace
spconv
{
std
::
vector
<
torch
::
Tensor
>
getIndicePairV2
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outPadding
,
int64_t
_subM
,
int64_t
_transpose
,
int64_t
_useHash
)
{
// auto timer = spconv::CudaContextTimer<>();
bool
subM
=
_subM
!=
0
;
bool
transpose
=
_transpose
!=
0
;
auto
NDim
=
kernelSize
.
size
();
// CPU always use hash (tsl::robin_map).
bool
useHash
=
_useHash
!=
0
||
indices
.
device
().
type
()
==
torch
::
kCPU
;
auto
numAct
=
indices
.
size
(
0
);
auto
coorDim
=
indices
.
size
(
1
)
-
1
;
// batchIdx + xyz
TV_ASSERT_RT_ERR
(
NDim
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
kernelSize
.
size
()
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
outSpatialShape
.
size
()
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
stride
.
size
()
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
padding
.
size
()
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
outPadding
.
size
()
==
coorDim
,
"error"
);
TV_ASSERT_RT_ERR
(
dilation
.
size
()
==
coorDim
,
"error"
);
auto
kernelVolume
=
kernelSize
[
0
];
for
(
int
i
=
1
;
i
<
kernelSize
.
size
();
++
i
)
{
kernelVolume
*=
kernelSize
[
i
];
}
TV_ASSERT_RT_ERR
(
kernelVolume
<=
4096
,
"error"
);
auto
outputVolume
=
outSpatialShape
[
0
];
for
(
int
i
=
1
;
i
<
outSpatialShape
.
size
();
++
i
)
{
outputVolume
*=
outSpatialShape
[
i
];
}
std
::
string
msg
=
"due to limits of cuda hash, the volume of dense space "
"include batch size "
;
msg
+=
"must less than std::numeric_limits<int>::max() = 2e9"
;
TV_ASSERT_RT_ERR
(
batchSize
*
outputVolume
<
std
::
numeric_limits
<
int
>::
max
(),
msg
);
torch
::
Tensor
indicePairs
=
torch
::
full
({
kernelVolume
,
2
,
numAct
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
torch
::
Tensor
indiceNum
=
torch
::
zeros
(
{
kernelVolume
},
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
auto
gridSize
=
batchSize
*
outputVolume
;
if
(
useHash
)
{
gridSize
=
batchSize
;
}
torch
::
Tensor
gridOut
=
torch
::
full
(
{
gridSize
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
gridOut
=
gridOut
.
view
({
batchSize
,
-
1
});
int64_t
numActOut
=
-
1
;
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
if
(
subM
)
{
padding
[
i
]
=
kernelSize
[
i
]
/
2
;
stride
[
i
]
=
1
;
}
}
if
(
subM
)
{
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
numActOut
=
create_submconv_indice_pair_cpu
(
indices
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
,
false
,
useHash
);
}
#ifdef TV_CUDA
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
numActOut
=
create_submconv_indice_pair_cuda
(
indices
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
,
false
,
useHash
);
}
#endif
else
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
return
{
indices
,
indicePairs
,
indiceNum
};
}
else
{
auto
indicePairUnique
=
torch
::
full
(
{
indicePairs
.
numel
()
/
2
+
1
},
std
::
numeric_limits
<
int
>::
max
(),
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
torch
::
Tensor
outInds
=
torch
::
zeros
({
numAct
*
kernelVolume
,
coorDim
+
1
},
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
numActOut
=
create_conv_indice_pair_cpu
(
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
,
false
,
useHash
);
}
#ifdef TV_CUDA
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
numActOut
=
create_conv_indice_pair_p1_cuda
(
indices
,
indicePairs
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
);
if
(
numActOut
>
0
)
{
auto
res
=
torch
::
_unique
(
indicePairUnique
);
indicePairUnique
=
std
::
get
<
0
>
(
res
);
numActOut
=
create_conv_indice_pair_p2_cuda
(
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
indicePairUnique
,
outSpatialShape
,
transpose
,
false
,
useHash
);
}
}
#endif
else
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
return
{
outInds
.
slice
(
0
,
0
,
numActOut
),
indicePairs
,
indiceNum
};
}
}
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
)
{
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
)
{
...
@@ -47,9 +153,6 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -47,9 +153,6 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double
totalGatherTime
=
0
;
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
double
totalSAddTime
=
0
;
tv
::
dispatch_torch
<
float
,
double
,
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
I
)
{
using
T
=
decltype
(
I
);
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
if
(
nHot
<=
0
||
(
subM
&&
i
==
indicePairMaxOffset
))
{
if
(
nHot
<=
0
||
(
subM
&&
i
==
indicePairMaxOffset
))
{
...
@@ -57,25 +160,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -57,25 +160,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
}
}
// auto timer = spconv::CudaContextTimer<>();
// auto timer = spconv::CudaContextTimer<>();
auto
outputBufferBlob
=
torch
::
from_blob
(
auto
outputBufferBlob
=
torch
::
from_blob
(
outputBuffer
.
data_ptr
<
T
>
(),
{
nHot
,
numOutPlanes
},
options
);
outputBuffer
.
data_ptr
(),
{
nHot
,
numOutPlanes
},
options
);
auto
inputBufferBlob
=
torch
::
from_blob
(
inputBuffer
.
data_ptr
<
T
>
(),
auto
inputBufferBlob
=
torch
::
from_blob
(
inputBuffer
.
data_ptr
(),
{
nHot
,
numInPlanes
},
options
);
{
nHot
,
numInPlanes
},
options
);
if
(
device
==
torch
::
kCPU
)
{
if
(
device
==
torch
::
kCPU
)
{
functor
::
SparseGatherFunctor
<
tv
::
CPU
,
T
,
int
>
gatherFtor
;
sparse_gather_cpu
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
gatherFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
T
>
(
features
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
gatherFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
T
>
(
features
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
TV_CHECK_CUDA_ERR
();
/* slower than SparseGatherFunctor, may due to int->long conversion
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
...
@@ -92,22 +186,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -92,22 +186,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0;
if
(
device
==
torch
::
kCPU
)
{
if
(
device
==
torch
::
kCPU
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
CPU
,
T
,
int
>
scatterFtor
;
sparse_scatter_add_cpu
(
outputBuffer
,
output
,
indicePairs
[
i
][
!
inverse
],
nHot
);
scatterFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
T
>
(
output
),
tv
::
torch2tv
<
const
T
>
(
outputBuffer
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
,
true
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
sparse_scatter_add_cuda
(
outputBuffer
,
output
,
indicePairs
[
i
][
!
inverse
],
nHot
);
scatterFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
output
),
tv
::
torch2tv
<
const
T
>
(
outputBuffer
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
,
true
);
TV_CHECK_CUDA_ERR
();
}
}
#endif
#endif
else
{
else
{
...
@@ -115,13 +198,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -115,13 +198,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
}
}
// totalSAddTime += timer.report() / 1000.0;
// totalSAddTime += timer.report() / 1000.0;
}
}
});
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return
output
;
return
output
;
}
}
std
::
vector
<
torch
::
Tensor
>
std
::
vector
<
torch
::
Tensor
>
indiceConvBackward
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
indiceConvBackward
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
...
@@ -158,40 +242,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
...
@@ -158,40 +242,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch
::
mm_out
(
filterGradSub
,
features
.
t
(),
outGrad
);
torch
::
mm_out
(
filterGradSub
,
features
.
t
(),
outGrad
);
torch
::
mm_out
(
inputGrad
,
outGrad
,
filters
[
indicePairMaxOffset
].
t
());
torch
::
mm_out
(
inputGrad
,
outGrad
,
filters
[
indicePairMaxOffset
].
t
());
}
}
tv
::
dispatch_torch
<
float
,
double
,
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
I
)
{
using
T
=
decltype
(
I
);
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
if
(
nHot
<=
0
||
(
subM
&&
i
==
indicePairMaxOffset
))
{
if
(
nHot
<=
0
||
(
subM
&&
i
==
indicePairMaxOffset
))
{
continue
;
continue
;
}
}
if
(
device
==
torch
::
kCPU
)
{
if
(
device
==
torch
::
kCPU
)
{
functor
::
SparseGatherFunctor
<
tv
::
CPU
,
T
,
int
>
gatherFtor
;
sparse_gather_cpu
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
functor
::
SparseGatherFunctor
<
tv
::
CPU
,
T
,
int
>
gatherFtorOut
;
sparse_gather_cpu
(
outputBuffer
,
outGrad
,
indicePairs
[
i
][
!
inverse
],
nHot
);
gatherFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
T
>
(
features
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
gatherFtorOut
(
tv
::
CPU
(),
tv
::
torch2tv
<
T
>
(
outputBuffer
),
tv
::
torch2tv
<
const
T
>
(
outGrad
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtorOut
;
sparse_gather_cuda
(
outputBuffer
,
outGrad
,
indicePairs
[
i
][
!
inverse
],
nHot
);
gatherFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
T
>
(
features
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
TV_CHECK_CUDA_ERR
();
gatherFtorOut
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
outputBuffer
),
tv
::
torch2tv
<
const
T
>
(
outGrad
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
);
TV_CHECK_CUDA_ERR
();
}
}
#endif
#endif
else
{
else
{
...
@@ -199,36 +262,27 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
...
@@ -199,36 +262,27 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
}
}
auto
filterGradSub
=
filtersGrad
[
i
];
auto
filterGradSub
=
filtersGrad
[
i
];
auto
outputBufferBlob
=
torch
::
from_blob
(
outputBuffer
.
data_ptr
<
T
>
(),
auto
outputBufferBlob
=
torch
::
from_blob
(
outputBuffer
.
data_ptr
(),
{
nHot
,
numOutPlanes
},
options
);
{
nHot
,
numOutPlanes
},
options
);
auto
inputBufferBlob
=
torch
::
from_blob
(
inputBuffer
.
data_ptr
<
T
>
(),
auto
inputBufferBlob
=
torch
::
from_blob
(
inputBuffer
.
data_ptr
(),
{
nHot
,
numInPlanes
},
options
);
{
nHot
,
numInPlanes
},
options
);
torch
::
mm_out
(
filterGradSub
,
inputBufferBlob
.
t
(),
outputBufferBlob
);
torch
::
mm_out
(
filterGradSub
,
inputBufferBlob
.
t
(),
outputBufferBlob
);
torch
::
mm_out
(
inputBufferBlob
,
outputBufferBlob
,
filters
[
i
].
t
());
torch
::
mm_out
(
inputBufferBlob
,
outputBufferBlob
,
filters
[
i
].
t
());
if
(
device
==
torch
::
kCPU
)
{
if
(
device
==
torch
::
kCPU
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
CPU
,
T
,
int
>
scatterFtor
;
sparse_scatter_add_cpu
(
inputBuffer
,
inputGrad
,
indicePairs
[
i
][
inverse
],
nHot
);
scatterFtor
(
tv
::
CPU
(),
tv
::
torch2tv
<
T
>
(
inputGrad
),
tv
::
torch2tv
<
const
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
sparse_scatter_add_cuda
(
inputBuffer
,
inputGrad
,
indicePairs
[
i
][
inverse
],
nHot
);
scatterFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputGrad
),
tv
::
torch2tv
<
const
T
>
(
inputBuffer
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
TV_CHECK_CUDA_ERR
();
}
}
#endif
#endif
else
{
else
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
}
}
}
});
return
{
inputGrad
,
filtersGrad
.
view
(
filterShape
)};
return
{
inputGrad
,
filtersGrad
.
view
(
filterShape
)};
}
}
}
// namespace spconv
}
// namespace spconv
\ No newline at end of file
test/test_conv.py
View file @
6c767a51
...
@@ -392,7 +392,7 @@ class TestSpConv(TestCase):
...
@@ -392,7 +392,7 @@ class TestSpConv(TestCase):
def
testSpDeConv3d
(
self
):
def
testSpDeConv3d
(
self
):
np
.
random
.
seed
(
484
)
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
devices
=
[
"cuda:0"
]
shapes
=
[[
19
,
18
,
17
]]
shapes
=
[[
19
,
18
,
17
]]
batchsizes
=
[
1
,
2
]
batchsizes
=
[
1
,
2
]
...
@@ -598,9 +598,9 @@ def main():
...
@@ -598,9 +598,9 @@ def main():
shapes
=
[[
50
,
30
,
30
]]
shapes
=
[[
50
,
30
,
30
]]
batchsizes
=
[
2
]
batchsizes
=
[
2
]
in_channels
=
[
2
56
]
in_channels
=
[
3
2
]
out_channels
=
[
25
6
]
out_channels
=
[
6
4
]
ksizes
=
[(
3
,
1
,
1
)]
ksizes
=
[(
3
,
3
,
3
)]
strides
=
[
1
]
strides
=
[
1
]
paddings
=
[
0
]
paddings
=
[
0
]
dilations
=
[
1
]
dilations
=
[
1
]
...
@@ -654,5 +654,6 @@ def main():
...
@@ -654,5 +654,6 @@ def main():
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
main
()
#
main()
# unittest.main()
# unittest.main()
TestSpConv
().
testSpDeConv3d
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment