Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
8cbb7d3c
Commit
8cbb7d3c
authored
May 28, 2020
by
yanyan
Browse files
1.2 release
parent
105b3892
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
424 additions
and
269 deletions
+424
-269
CHANGELOG.md
CHANGELOG.md
+11
-0
CMakeLists.txt
CMakeLists.txt
+1
-0
include/spconv/reordering.cu.h
include/spconv/reordering.cu.h
+88
-46
include/spconv/reordering.h
include/spconv/reordering.h
+4
-4
include/spconv/spconv_ops.h
include/spconv/spconv_ops.h
+8
-8
include/tensorview/tensor.h
include/tensorview/tensor.h
+3
-0
include/tensorview/torch_utils.h
include/tensorview/torch_utils.h
+15
-0
setup.py
setup.py
+3
-0
spconv/__init__.py
spconv/__init__.py
+4
-3
spconv/functional.py
spconv/functional.py
+46
-15
spconv/modules.py
spconv/modules.py
+1
-0
spconv/ops.py
spconv/ops.py
+8
-5
src/spconv/reordering.cu
src/spconv/reordering.cu
+23
-24
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+168
-142
test/test_conv.py
test/test_conv.py
+41
-22
No files found.
CHANGELOG.md
0 → 100644
View file @
8cbb7d3c
# Changelog
## [1.2.0] - 2020-05-28
### Added
-
add batch gemm support. It gives a small performance improvement at the cost of more GPU memory usage. You can enable it with algo=spconv.ConvAlgo.Batch.
### Changed
-
replace most of 'functor' with c++14 dispatch in c++ code.
### Fixed
-
change gather/scatterAdd kernel parameters to support a large number of points.
CMakeLists.txt
View file @
8cbb7d3c
...
...
@@ -11,6 +11,7 @@ endif()
if
(
WIN32
)
# true if windows (32 and 64 bit)
add_compile_definitions
(
TV_WINDOWS
)
endif
()
add_compile_definitions
(
PYTORCH_VERSION=
${
PYTORCH_VERSION
}
)
set
(
CMAKE_CXX_EXTENSIONS OFF
)
# avoid gnu++11 be added to CXX flags
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
...
...
include/spconv/reordering.cu.h
View file @
8cbb7d3c
...
...
@@ -15,7 +15,15 @@
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <THC/THCNumerics.cuh>
#include <tensorview/kernel_utils.h>
#if PYTORCH_VERSION < 10500
#define TH_ATOMIC_ADD atomicAdd
#else
#define TH_ATOMIC_ADD gpuAtomicAdd
#endif
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace
spconv
{
...
...
@@ -78,21 +86,21 @@ template <typename T, typename Index, int NumTLP, int NumILP,
__global__
void
gatherVecBlockKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
size
,
int
numPlanes
)
{
int
ILPStride
Y
[
NumILP
];
int
ILPStride
X
[
NumILP
];
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStride
Y
[
ilp
]
=
ilp
*
gridDim
.
y
*
blockDim
.
y
;
features
+=
blockIdx
.
x
*
NumTLP
;
buffer
+=
blockIdx
.
x
*
NumTLP
;
ILPStride
X
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
features
+=
blockIdx
.
y
*
NumTLP
;
buffer
+=
blockIdx
.
y
*
NumTLP
;
for
(
int
i
y
:
tv
::
KernelLoop
Y
<
int
,
NumILP
>
(
size
))
{
for
(
int
i
x
:
tv
::
KernelLoop
X
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
reinterpret_cast
<
VecType
*>
(
buffer
)[(
i
y
+
ILPStride
Y
[
ilp
])
*
numPlanes
+
threadIdx
.
x
]
=
buffer
)[(
i
x
+
ILPStride
X
[
ilp
])
*
numPlanes
+
threadIdx
.
y
]
=
reinterpret_cast
<
const
VecType
*>
(
features
)[
indices
[
i
y
+
ILPStride
Y
[
ilp
]]
*
numPlanes
+
threadIdx
.
x
];
features
)[
indices
[
i
x
+
ILPStride
X
[
ilp
]]
*
numPlanes
+
threadIdx
.
y
];
}
}
}
...
...
@@ -124,22 +132,33 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
for
(
int
iy
:
tv
::
KernelLoopY
<
int
>
(
numPlanes
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
&&
inds
[
ilp
]
!=
-
1
)
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
features
[
inds
[
ilp
]
*
numPlanes
+
iy
];
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
{
if
(
inds
[
ilp
]
!=
-
1
)
{
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
features
[
inds
[
ilp
]
*
numPlanes
+
iy
];
}
else
{
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
T
(
0
);
}
}
}
}
}
}
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
,
typename
VecType
>
__global__
void
batchGatherVecKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
size
,
int
feature_offset
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
__global__
void
batchGatherVecKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
size
,
int
feature_offset
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
int
ILPStrideX
[
NumILP
];
Index
inds
[
NumILP
];
Index
zero
[
sizeof
(
VecType
)
/
sizeof
(
T
)];
#pragma unroll
for
(
int
i
=
0
;
i
<
sizeof
(
VecType
)
/
sizeof
(
T
);
++
i
)
{
zero
[
i
]
=
T
(
0
);
}
Index
inds_elem
;
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
...
...
@@ -158,11 +177,19 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
for
(
int
iy
:
tv
::
KernelLoopY
<
int
>
(
numPlanes
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
&&
inds
[
ilp
]
!=
-
1
)
reinterpret_cast
<
VecType
*>
(
buffer
)[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
reinterpret_cast
<
const
VecType
*>
(
features
)[
inds
[
ilp
]
*
numPlanes
+
iy
];
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
{
if
(
inds
[
ilp
]
!=
-
1
)
{
reinterpret_cast
<
VecType
*>
(
buffer
)[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
reinterpret_cast
<
const
VecType
*>
(
features
)[
inds
[
ilp
]
*
numPlanes
+
iy
];
}
else
{
reinterpret_cast
<
VecType
*>
(
buffer
)[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
reinterpret_cast
<
const
VecType
*>
(
&
zero
)[
0
];
}
}
}
}
}
...
...
@@ -174,29 +201,38 @@ __global__ void
batchGatherVecBlockKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
size
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
int
ILPStride
Y
[
NumILP
];
int
ILPStride
X
[
NumILP
];
Index
inds
;
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStride
Y
[
ilp
]
=
ilp
*
gridDim
.
y
*
blockDim
.
y
;
features
+=
blockIdx
.
x
*
NumTLP
;
buffer
+=
blockIdx
.
x
*
NumTLP
;
ILPStride
X
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
features
+=
blockIdx
.
y
*
NumTLP
;
buffer
+=
blockIdx
.
y
*
NumTLP
;
Index
inds_elem
;
Index
zero
[
sizeof
(
VecType
)
/
sizeof
(
T
)];
#pragma unroll
for
(
int
i
=
0
;
i
<
sizeof
(
VecType
)
/
sizeof
(
T
);
++
i
)
{
zero
[
i
]
=
T
(
0
);
}
for
(
int
i
y
:
tv
::
KernelLoop
Y
<
int
,
NumILP
>
(
size
))
{
for
(
int
i
x
:
tv
::
KernelLoop
X
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
inds_elem
=
i
y
+
ILPStride
Y
[
ilp
];
inds_elem
=
i
x
+
ILPStride
X
[
ilp
];
inds
=
indices
[(
inds_elem
/
feature_batch_stride
)
*
indice_batch_stride
+
inds_elem
%
feature_batch_stride
];
if
(
inds
!=
-
1
)
{
reinterpret_cast
<
VecType
*>
(
buffer
)[(
i
y
+
ILPStride
Y
[
ilp
])
*
numPlanes
+
threadIdx
.
x
]
=
buffer
)[(
i
x
+
ILPStride
X
[
ilp
])
*
numPlanes
+
threadIdx
.
y
]
=
reinterpret_cast
<
const
VecType
*>
(
features
)[
inds
*
numPlanes
+
threadIdx
.
x
];
features
)[
inds
*
numPlanes
+
threadIdx
.
y
];
}
else
{
reinterpret_cast
<
VecType
*>
(
buffer
)[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
threadIdx
.
y
]
=
reinterpret_cast
<
const
VecType
*>
(
&
zero
)[
0
];
}
}
}
...
...
@@ -234,24 +270,24 @@ template <typename T, typename Index, int NumTLP, int NumILP,
__global__
void
scatterAddVecBlockKernel
(
T
*
outFeatures
,
const
T
*
buffer
,
const
Index
*
indices
,
int
size
,
int
numPlanes
)
{
int
ILPStride
Y
[
NumILP
];
int
ILPStride
X
[
NumILP
];
constexpr
int
vecloadFactor
=
sizeof
(
VecType
)
/
sizeof
(
T
);
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStride
Y
[
ilp
]
=
ilp
*
gridDim
.
y
*
blockDim
.
y
;
outFeatures
+=
blockIdx
.
x
*
NumTLP
;
buffer
+=
blockIdx
.
x
*
NumTLP
;
ILPStride
X
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
outFeatures
+=
blockIdx
.
y
*
NumTLP
;
buffer
+=
blockIdx
.
y
*
NumTLP
;
T
buf
[
vecloadFactor
];
T
buf2
[
vecloadFactor
];
Index
idx
;
for
(
int
i
y
:
tv
::
KernelLoop
Y
<
int
,
NumILP
>
(
size
))
{
for
(
int
i
x
:
tv
::
KernelLoop
X
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
idx
=
indices
[
i
y
+
ILPStride
Y
[
ilp
]]
*
numPlanes
+
threadIdx
.
x
;
idx
=
indices
[
i
x
+
ILPStride
X
[
ilp
]]
*
numPlanes
+
threadIdx
.
y
;
reinterpret_cast
<
VecType
*>
(
buf
)[
0
]
=
reinterpret_cast
<
VecType
*>
(
outFeatures
)[
idx
];
reinterpret_cast
<
VecType
*>
(
buf2
)[
0
]
=
reinterpret_cast
<
const
VecType
*>
(
buffer
)[(
i
y
+
ILPStride
Y
[
ilp
])
*
numPlanes
+
threadIdx
.
x
];
buffer
)[(
i
x
+
ILPStride
X
[
ilp
])
*
numPlanes
+
threadIdx
.
y
];
#pragma unroll
for
(
int
i
=
0
;
i
<
vecloadFactor
;
i
++
)
{
buf
[
i
]
+=
buf2
[
i
];
...
...
@@ -268,6 +304,10 @@ __global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
int
feature_offset
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
// batch scatter add is greatly slower than native scatter when the number of
// points is large. this may due to atomicAdd?
// batch scatter add is greatly faster than native when the number of points
// is small.
int
ILPStrideX
[
NumILP
];
Index
inds
[
NumILP
];
Index
inds_elem
;
...
...
@@ -288,8 +328,8 @@ __global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
&&
inds
[
ilp
]
!=
-
1
)
{
gpuAtomicAdd
(
outFeatures
+
inds
[
ilp
]
*
numPlanes
+
iy
,
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]);
TH_ATOMIC_ADD
(
outFeatures
+
inds
[
ilp
]
*
numPlanes
+
iy
,
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]);
}
}
}
...
...
@@ -301,22 +341,22 @@ __global__ void
batchScatterAddBlockKernel
(
T
*
outFeatures
,
const
T
*
buffer
,
const
Index
*
indices
,
int
size
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
int
ILPStride
Y
[
NumILP
];
int
ILPStride
X
[
NumILP
];
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStride
Y
[
ilp
]
=
ilp
*
gridDim
.
y
*
blockDim
.
y
;
outFeatures
+=
blockIdx
.
x
*
NumTLP
;
buffer
+=
blockIdx
.
x
*
NumTLP
;
ILPStride
X
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
outFeatures
+=
blockIdx
.
y
*
NumTLP
;
buffer
+=
blockIdx
.
y
*
NumTLP
;
Index
inds
,
inds_elem
;
for
(
int
i
y
:
tv
::
KernelLoop
Y
<
int
,
NumILP
>
(
size
))
{
for
(
int
i
x
:
tv
::
KernelLoop
X
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
inds_elem
=
i
y
+
ILPStride
Y
[
ilp
];
inds_elem
=
i
x
+
ILPStride
X
[
ilp
];
inds
=
indices
[(
inds_elem
/
feature_batch_stride
)
*
indice_batch_stride
+
inds_elem
%
feature_batch_stride
];
if
(
inds
!=
-
1
)
{
gpuAtomicAdd
(
outFeatures
+
inds
*
numPlanes
+
threadIdx
.
x
,
buffer
[(
i
y
+
ILPStride
Y
[
ilp
])
*
numPlanes
+
threadIdx
.
x
]);
TH_ATOMIC_ADD
(
outFeatures
+
inds
*
numPlanes
+
threadIdx
.
y
,
buffer
[(
i
x
+
ILPStride
X
[
ilp
])
*
numPlanes
+
threadIdx
.
y
]);
}
}
}
...
...
@@ -324,4 +364,6 @@ batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
}
// namespace spconv
#undef TH_ATOMIC_ADD
#endif
\ No newline at end of file
include/spconv/reordering.h
View file @
8cbb7d3c
...
...
@@ -20,10 +20,10 @@
namespace
spconv
{
void
batch_sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
);
void
batch_sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
);
torch
::
Tensor
indices
,
int
size
);
void
batch_sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
);
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
);
...
...
include/spconv/spconv_ops.h
View file @
8cbb7d3c
...
...
@@ -23,10 +23,7 @@
namespace
spconv
{
enum
ConvAlgo
{
kNative
=
0
,
kBatchGemm
=
1
};
enum
ConvAlgo
{
kNative
=
0
,
kBatch
=
1
,
kBatchGemmGather
=
2
};
// torch.jit's doc says only support int64, so we need to convert to int32.
template
<
unsigned
NDim
>
...
...
@@ -345,8 +342,10 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
}
}
torch
::
Tensor
indiceConvBatch
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
);
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
,
bool
batchScatter
);
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
...
...
@@ -355,13 +354,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
std
::
vector
<
torch
::
Tensor
>
indiceConvBackward
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
_inverse
,
int64_t
_subM
,
int64_t
algo
);
torch
::
Tensor
indiceNum
,
int64_t
_inverse
,
int64_t
_subM
,
int64_t
algo
);
std
::
vector
<
torch
::
Tensor
>
indiceConvBackwardBatch
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
_inverse
,
int64_t
_subM
);
int64_t
_subM
,
bool
batchScatter
);
}
// namespace spconv
#endif
\ No newline at end of file
include/tensorview/tensor.h
View file @
8cbb7d3c
...
...
@@ -80,6 +80,8 @@ public:
}
}
else
{
#ifdef TV_CUDA
// we should select device in external
/*
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount) {
...
...
@@ -87,6 +89,7 @@ public:
" but you only have ", deviceCount, " device.");
}
cudaSetDevice(device);
*/
if
(
managed
)
{
checkCudaErrors
(
cudaMallocManaged
(
&
this
->
mPtr
,
size
*
sizeof
(
T
)));
}
else
{
...
...
include/tensorview/torch_utils.h
View file @
8cbb7d3c
...
...
@@ -125,6 +125,21 @@ TensorView<T, Rank, PtrTraits, Tindex> torch2tv(const torch::Tensor &tensor) {
return
tv
::
TensorView
<
T
,
Rank
,
PtrTraits
,
Tindex
>
(
tensor
.
data_ptr
<
std
::
remove_const_t
<
T
>>
(),
shape
);
}
/// Returns a tensor that aliases rows [start, end) of `tensor` along axis 0.
///
/// Hand-rolled because, per the original author's note, only torch >= 1.5
/// offers tensor slicing in the C++ API.
///
/// @param tensor source tensor; must have at least one dimension, since
///               `shape[0]` is written unconditionally.
/// @param start  first row of the slice (inclusive).
/// @param end    one past the last row of the slice (exclusive).
/// @return a tensor of shape {end - start, sizes[1:]...} that uses the same
///         options (dtype/device) and the same strides as `tensor`.
///
/// NOTE: the result is created with torch::from_blob, which wraps existing
/// memory without taking ownership of it. The caller must keep `tensor`
/// (or another owner of its storage) alive while the slice is in use.
/// No bounds checking is performed on `start` / `end`.
template <typename T>
torch::Tensor torch_slice_first_axis(torch::Tensor tensor, T start, T end) {
  const auto tensor_shape = tensor.sizes();
  std::vector<int64_t> shape(tensor_shape.begin(), tensor_shape.end());
  shape[0] = end - start;

  // Forward the source strides explicitly. from_blob without strides assumes
  // a contiguous layout, which would silently misread a non-contiguous input.
  const auto tensor_strides = tensor.strides();
  std::vector<int64_t> strides(tensor_strides.begin(), tensor_strides.end());

  // Byte-wise pointer arithmetic: advance by `start` rows of stride(0)
  // elements, each itemsize() bytes wide.
  uint8_t *ptr = reinterpret_cast<uint8_t *>(tensor.data_ptr());
  return torch::from_blob(ptr + start * tensor.stride(0) * tensor.itemsize(),
                          torch::IntArrayRef(shape),
                          torch::IntArrayRef(strides), tensor.options());
}
namespace
detail
{
template
<
>
struct
TypeToString
<
at
::
Half
>
{
static
constexpr
const
char
*
value
=
"half"
;
...
...
setup.py
View file @
8cbb7d3c
...
...
@@ -18,6 +18,8 @@ LIBTORCH_ROOT = str(Path(torch.__file__).parent)
SPCONV_FORCE_BUILD_CUDA
=
os
.
getenv
(
"SPCONV_FORCE_BUILD_CUDA"
)
PYTHON_VERSION
=
"{}.{}"
.
format
(
sys
.
version_info
.
major
,
sys
.
version_info
.
minor
)
PYTORCH_VERSION
=
list
(
map
(
int
,
torch
.
__version__
.
split
(
"."
)))
PYTORCH_VERSION_NUMBER
=
PYTORCH_VERSION
[
0
]
*
10000
+
PYTORCH_VERSION
[
1
]
*
100
+
PYTORCH_VERSION
[
2
]
class
CMakeExtension
(
Extension
):
def
__init__
(
self
,
name
,
sourcedir
=
''
,
library_dirs
=
[]):
...
...
@@ -47,6 +49,7 @@ class CMakeBuild(build_ext):
'-DCMAKE_PREFIX_PATH={}'
.
format
(
LIBTORCH_ROOT
),
'-DPYBIND11_PYTHON_VERSION={}'
.
format
(
PYTHON_VERSION
),
'-DSPCONV_BuildTests=OFF'
,
'-DPYTORCH_VERSION={}'
.
format
(
PYTORCH_VERSION_NUMBER
)
]
# -arch=sm_61
if
not
torch
.
cuda
.
is_available
()
and
SPCONV_FORCE_BUILD_CUDA
is
None
:
cmake_args
+=
[
'-DSPCONV_BuildCUDA=OFF'
]
...
...
spconv/__init__.py
View file @
8cbb7d3c
...
...
@@ -19,12 +19,12 @@ import numpy as np
import
torch
from
spconv
import
ops
,
utils
from
spconv.ops
import
ConvAlgo
from
spconv.conv
import
(
SparseConv2d
,
SparseConv3d
,
SparseConvTranspose2d
,
SparseConvTranspose3d
,
SparseInverseConv2d
,
SparseInverseConv3d
,
SubMConv2d
,
SubMConv3d
)
from
spconv.identity
import
Identity
from
spconv.modules
import
SparseModule
,
SparseSequential
from
spconv.ops
import
ConvAlgo
from
spconv.pool
import
SparseMaxPool2d
,
SparseMaxPool3d
from
spconv.tables
import
AddTable
,
ConcatTable
,
JoinTable
...
...
@@ -62,7 +62,7 @@ class SparseConvTensor(object):
self
.
features
=
features
self
.
indices
=
indices
if
self
.
indices
.
dtype
!=
torch
.
int32
:
self
.
indices
.
int
()
self
.
indices
=
self
.
indices
.
int
()
self
.
spatial_shape
=
spatial_shape
self
.
batch_size
=
batch_size
self
.
indice_dict
=
{}
...
...
@@ -82,7 +82,8 @@ class SparseConvTensor(object):
def
dense
(
self
,
channels_first
=
True
):
output_shape
=
[
self
.
batch_size
]
+
list
(
self
.
spatial_shape
)
+
[
self
.
features
.
shape
[
1
]]
res
=
scatter_nd
(
self
.
indices
.
long
().
to
(
self
.
features
.
device
),
self
.
features
,
output_shape
)
res
=
scatter_nd
(
self
.
indices
.
long
().
to
(
self
.
features
.
device
),
self
.
features
,
output_shape
)
if
not
channels_first
:
return
res
ndim
=
len
(
self
.
spatial_shape
)
...
...
spconv/functional.py
View file @
8cbb7d3c
...
...
@@ -25,16 +25,25 @@ class SparseConvFunction(Function):
num_activate_out
,
algo
):
ctx
.
save_for_backward
(
indice_pairs
,
indice_pair_num
,
features
,
filters
)
ctx
.
algo
=
algo
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
False
,
algo
=
algo
)
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
False
,
algo
=
algo
)
@
staticmethod
def
backward
(
ctx
,
grad_output
):
indice_pairs
,
indice_pair_num
,
features
,
filters
=
ctx
.
saved_tensors
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
indice_pairs
,
indice_pair_num
,
False
,
algo
=
ctx
.
algo
)
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
indice_pairs
,
indice_pair_num
,
False
,
algo
=
ctx
.
algo
)
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
...
...
@@ -45,15 +54,26 @@ class SparseInverseConvFunction(Function):
num_activate_out
,
algo
):
ctx
.
save_for_backward
(
indice_pairs
,
indice_pair_num
,
features
,
filters
)
ctx
.
algo
=
algo
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
True
,
False
,
algo
=
algo
)
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
True
,
False
,
algo
=
algo
)
@
staticmethod
def
backward
(
ctx
,
grad_output
):
indice_pairs
,
indice_pair_num
,
features
,
filters
=
ctx
.
saved_tensors
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
indice_pairs
,
indice_pair_num
,
True
,
False
,
algo
=
ctx
.
algo
)
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
indice_pairs
,
indice_pair_num
,
True
,
False
,
algo
=
ctx
.
algo
)
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
...
...
@@ -64,15 +84,26 @@ class SubMConvFunction(Function):
num_activate_out
,
algo
):
ctx
.
save_for_backward
(
indice_pairs
,
indice_pair_num
,
features
,
filters
)
ctx
.
algo
=
algo
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
False
,
True
,
algo
=
algo
)
return
ops
.
indice_conv
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
False
,
True
,
algo
=
algo
)
@
staticmethod
def
backward
(
ctx
,
grad_output
):
indice_pairs
,
indice_pair_num
,
features
,
filters
=
ctx
.
saved_tensors
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
indice_pairs
,
indice_pair_num
,
False
,
True
,
algo
=
ctx
.
algo
)
input_bp
,
filters_bp
=
ops
.
indice_conv_backward
(
features
,
filters
,
grad_output
,
indice_pairs
,
indice_pair_num
,
False
,
True
,
algo
=
ctx
.
algo
)
return
input_bp
,
filters_bp
,
None
,
None
,
None
,
None
...
...
spconv/modules.py
View file @
8cbb7d3c
...
...
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
sys
import
time
from
collections
import
OrderedDict
...
...
spconv/ops.py
View file @
8cbb7d3c
...
...
@@ -12,15 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from
enum
import
Enum
import
torch
import
spconv
from
enum
import
Enum
class
ConvAlgo
(
Enum
):
Native
=
0
BatchGemm
=
1
Native
=
0
# small memory cost, faster when number of points is large.
Batch
=
1
# high memory cost, faster when number of points is small (< 50000)
BatchGemmGather
=
2
# high memory cost, faster when number of points medium
def
get_conv_output_size
(
input_size
,
kernel_size
,
stride
,
padding
,
dilation
):
ndim
=
len
(
input_size
)
...
...
@@ -59,7 +62,7 @@ def get_indice_pairs(indices,
subm
=
False
,
transpose
=
False
,
grid
=
None
,
use_hash
=
Tru
e
):
use_hash
=
Fals
e
):
ndim
=
indices
.
shape
[
1
]
-
1
if
not
isinstance
(
ksize
,
(
list
,
tuple
)):
ksize
=
[
ksize
]
*
ndim
...
...
@@ -133,7 +136,7 @@ def indice_conv_backward(features,
indice_pair_num
,
inverse
=
False
,
subm
=
False
,
algo
=
ConvAlgo
.
Native
.
value
):
algo
=
ConvAlgo
.
Native
.
value
):
return
torch
.
ops
.
spconv
.
indice_conv_backward
(
features
,
filters
,
out_bp
,
indice_pairs
,
indice_pair_num
,
int
(
inverse
),
int
(
subm
),
algo
)
...
...
src/spconv/reordering.cu
View file @
8cbb7d3c
...
...
@@ -25,7 +25,6 @@
#include <tensorview/torch_utils.h>
#include <type_traits>
#include <utility/timer.h>
namespace
spconv
{
using
float_types_t
=
tv
::
mp_list
<
float
,
double
,
at
::
Half
>
;
...
...
@@ -48,7 +47,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
using
Index
=
decltype
(
IndexValue
);
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
tv
::
mp_for_each
<
kernel_block_t
>
(
[
=
,
&
buffer
,
&
features
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
...
...
@@ -59,8 +58,8 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
if
(
nHotBlock
>=
NumTLP
)
{
gatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
<<<
dim3
(
size
/
NumTLP
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
stream
>>>
(
buffer
.
data_ptr
<
T
>
(),
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
...
...
@@ -115,7 +114,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
buffer
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
...
...
@@ -127,8 +126,8 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
if
(
nHotBlock
>=
NumTLP
)
{
scatterAddVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
<<<
dim3
(
size
/
NumTLP
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
stream
>>>
(
outFeatures
.
data_ptr
<
T
>
(),
buffer
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
...
...
@@ -194,31 +193,31 @@ void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
if
(
nHotBlock
>=
NumTLP
)
{
batchGatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
<<<
dim3
(
size
/
NumTLP
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
stream
>>>
(
buffer
.
data_ptr
<
T
>
(),
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR_V2
(
"batchGatherVecBlockKernel"
);
}
if
(
size
-
nHotBlock
>
0
)
{
batchGatherVecKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
batchGatherVecKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
stream
>>>
(
buffer
.
data_ptr
<
T
>
()
+
nHotBlock
*
numPlanes
,
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
size
-
nHotBlock
,
nHotBlock
,
numPlanes
/
vecloadFactor
,
indices
.
data_ptr
<
Index
>
(),
size
-
nHotBlock
,
nHotBlock
,
numPlanes
/
vecloadFactor
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
(
);
TV_CHECK_CUDA_ERR
_V2
(
"batchGatherVecKernel"
);
}
notFound
=
false
;
}
}
});
if
(
notFound
)
{
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
...
...
@@ -259,7 +258,7 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
using
Index
=
decltype
(
IndexValue
);
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
1
;
// important for half.
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
buffer
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
...
...
@@ -270,12 +269,12 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
nHotBlock
>=
NumTLP
)
{
batchScatterAddBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
<<<
dim3
(
size
/
NumTLP
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
stream
>>>
(
outFeatures
.
data_ptr
<
T
>
(),
buffer
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
,
inds_stride
,
feature_stride
);
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
if
(
size
-
nHotBlock
>
0
)
{
...
...
@@ -283,8 +282,8 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
stream
>>>
(
outFeatures
.
data_ptr
<
T
>
(),
buffer
.
data_ptr
<
T
>
()
+
nHotBlock
*
numPlanes
,
indices
.
data_ptr
<
Index
>
(),
size
-
nHotBlock
,
nHotBlock
,
numPlanes
,
inds_stride
,
indices
.
data_ptr
<
Index
>
(),
size
-
nHotBlock
,
nHotBlock
,
numPlanes
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
...
...
@@ -292,7 +291,7 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
}
}
});
if
(
notFound
)
{
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
...
...
@@ -309,4 +308,4 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
});
}
}
// namespace spconv
\ No newline at end of file
}
// namespace spconv
src/spconv/spconv_ops.cc
View file @
8cbb7d3c
...
...
@@ -139,10 +139,12 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
int64_t
algo
)
{
auto
kernelVolume
=
indiceNum
.
size
(
0
);
switch
(
algo
)
{
case
kBatchGemm
:
{
case
kBatchGemmGather
:
case
kBatch
:
{
if
(
kernelVolume
!=
1
)
{
return
indiceConvBatch
(
features
,
filters
,
indicePairs
,
indiceNum
,
numActOut
,
_inverse
,
_subM
);
numActOut
,
_inverse
,
_subM
,
algo
!=
kBatchGemmGather
);
}
else
{
break
;
}
...
...
@@ -152,6 +154,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
default:
TV_THROW_RT_ERR
(
"unknown algo"
);
}
// auto timer = spconv::CudaContextTimer<>();
bool
subM
=
_subM
!=
0
;
bool
inverse
=
_inverse
!=
0
;
auto
device
=
features
.
device
().
type
();
...
...
@@ -170,10 +174,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch
::
TensorOptions
().
dtype
(
features
.
dtype
()).
device
(
features
.
device
());
torch
::
Tensor
output
=
torch
::
zeros
({
numActOut
,
numOutPlanes
},
options
);
torch
::
Tensor
inputBuffer
=
torch
::
zeros
({
indicePairMaxSize
,
numInPlanes
},
options
);
torch
::
empty
({
indicePairMaxSize
,
numInPlanes
},
options
);
torch
::
Tensor
outputBuffer
=
torch
::
empty
({
indicePairMaxSize
,
numOutPlanes
},
options
);
filters
=
filters
.
view
({
-
1
,
numInPlanes
,
numOutPlanes
});
if
(
subM
)
{
// the center index of subm conv don't need gather and scatter
// add.
torch
::
mm_out
(
output
,
features
,
filters
[
indicePairMaxOffset
]);
...
...
@@ -181,12 +186,13 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
if
(
nHot
<=
0
||
(
subM
&&
i
==
indicePairMaxOffset
))
{
continue
;
}
// auto timer = spconv::CudaContextTimer<>();
auto
outputBufferBlob
=
torch
::
from_blob
(
outputBuffer
.
data_ptr
(),
{
nHot
,
numOutPlanes
},
options
);
auto
inputBufferBlob
=
...
...
@@ -208,7 +214,10 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
// totalGatherTime += timer.report() / 1000.0;
torch
::
mm_out
(
outputBufferBlob
,
inputBufferBlob
,
filters
[
i
]);
// totalGEMMTime += timer.report() / 1000.0;
if
(
device
==
torch
::
kCPU
)
{
sparse_scatter_add_cpu
(
outputBuffer
,
output
,
indicePairs
[
!
inverse
][
i
],
nHot
);
...
...
@@ -222,14 +231,17 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
return
output
;
}
torch
::
Tensor
indiceConvBatch
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
)
{
int64_t
_inverse
,
int64_t
_subM
,
bool
batchScatter
)
{
bool
subM
=
_subM
!=
0
;
bool
inverse
=
_inverse
!=
0
;
auto
device
=
features
.
device
().
type
();
...
...
@@ -238,6 +250,7 @@ torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
TV_ASSERT_INVALID_ARG
(
kernelVolume
>
1
,
"error"
);
auto
numInPlanes
=
features
.
size
(
1
);
auto
numOutPlanes
=
filters
.
size
(
ndim
+
1
);
// auto timer = spconv::CudaContextTimer<>();
auto
indicePairNumCpu
=
indiceNum
.
to
({
torch
::
kCPU
});
auto
indicePairNumVec
=
std
::
vector
<
int
>
(
indicePairNumCpu
.
data_ptr
<
int
>
(),
...
...
@@ -257,85 +270,98 @@ torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
// number of indice in the center of filter is much more than other
// filter location.
// so we first use top2 indice num to do batch conv, then
// do native conv in center.
// do native conv
(gemm)
in center.
int
bufferSize
=
subM
?
indicePairTop2Size
:
indicePairMaxSize
;
torch
::
Tensor
inputBuffer
=
torch
::
zeros
({
kernelVolume
,
bufferSize
,
numInPlanes
},
options
);
torch
::
Tensor
outputBuffer
=
torch
::
empty
({
kernelVolume
,
bufferSize
,
numOutPlanes
},
options
);
int
maxKernelVolumePart
=
kernelVolume
;
std
::
vector
<
std
::
pair
<
int
,
int
>>
part_ranges
=
{{
0
,
kernelVolume
}};
filters
=
filters
.
view
({
kernelVolume
,
numInPlanes
,
numOutPlanes
});
int64_t
size
=
kernelVolume
*
bufferSize
;
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairs
[
inverse
],
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
torch
::
bmm_out
(
outputBuffer
,
inputBuffer
,
filters
);
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_scatter_add_cuda
(
outputBuffer
,
output
,
indicePairs
[
!
inverse
],
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
if
(
subM
)
{
auto
remain_size
=
indicePairMaxSize
-
indicePairTop2Size
;
if
(
remain_size
<=
0
)
{
maxKernelVolumePart
=
std
::
max
(
indicePairMaxOffset
,
int
(
kernelVolume
-
indicePairMaxOffset
-
1
));
part_ranges
=
{{
0
,
indicePairMaxOffset
},
{
indicePairMaxOffset
+
1
,
kernelVolume
}};
torch
::
mm_out
(
output
,
features
,
filters
[
indicePairMaxOffset
]);
if
(
indicePairTop2Size
==
0
)
{
return
output
;
}
inputBuffer
=
torch
::
empty
({
remain_size
,
numInPlanes
},
options
);
outputBuffer
=
torch
::
empty
({
remain_size
,
numOutPlanes
},
options
);
}
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
torch
::
Tensor
inputBuffer
=
torch
::
empty
({
maxKernelVolumePart
,
bufferSize
,
numInPlanes
},
options
);
torch
::
Tensor
outputBuffer
=
torch
::
empty
({
maxKernelVolumePart
,
bufferSize
,
numOutPlanes
},
options
);
for
(
auto
&
range
:
part_ranges
)
{
int
start
=
range
.
first
;
int
end
=
range
.
second
;
int
length
=
end
-
start
;
int64_t
size
=
length
*
bufferSize
;
auto
inputBufferPart
=
tv
::
torch_slice_first_axis
(
inputBuffer
,
0
,
length
);
auto
outputBufferPart
=
tv
::
torch_slice_first_axis
(
outputBuffer
,
0
,
length
);
auto
indicePairs1Part
=
tv
::
torch_slice_first_axis
(
indicePairs
[
inverse
],
start
,
end
);
auto
indicePairs2Part
=
tv
::
torch_slice_first_axis
(
indicePairs
[
!
inverse
],
start
,
end
);
auto
filtersPart
=
tv
::
torch_slice_first_axis
(
filters
,
start
,
end
);
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
indice_dtype
,
[
&
](
auto
I
)
{
using
Index
=
decltype
(
I
);
auto
indicePairsRemain
=
torch
::
from_blob
(
indicePairs
[
inverse
][
indicePairMaxOffset
].
data_ptr
<
Index
>
()
+
indicePairTop2Size
,
{
remain_size
},
indicePairs
.
options
());
sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairsRemain
,
remain_size
);
});
batch_sparse_gather_cuda
(
inputBufferPart
,
features
,
indicePairs1Part
,
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
torch
::
mm_out
(
outputBuffer
,
inputBuffer
,
filters
[
indicePairMaxOffset
]);
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
// totalGatherTime += timer.report() / 1000.0;
torch
::
bmm_out
(
outputBufferPart
,
inputBufferPart
,
filtersPart
);
// totalGEMMTime += timer.report() / 1000.0;
if
(
batchScatter
)
{
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
indice_dtype
,
[
&
](
auto
I
)
{
using
Index
=
decltype
(
I
);
auto
indicePairsRemain
=
torch
::
from_blob
(
indicePairs
[
!
inverse
][
indicePairMaxOffset
].
data_ptr
<
Index
>
()
+
indicePairTop2Size
,
{
remain_size
},
indicePairs
.
options
());
sparse_scatter_add_cuda
(
outputBuffer
,
output
,
indicePairsRemain
,
remain_size
);
});
}
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_scatter_add_cuda
(
outputBufferPart
,
output
,
indicePairs2Part
,
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
}
else
{
for
(
int
i
=
0
;
i
<
length
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
+
start
];
if
(
nHot
<=
0
)
{
continue
;
}
if
(
device
==
torch
::
kCPU
)
{
sparse_scatter_add_cpu
(
outputBufferPart
[
i
],
output
,
indicePairs2Part
[
i
],
nHot
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
sparse_scatter_add_cuda
(
outputBufferPart
[
i
],
output
,
indicePairs2Part
[
i
],
nHot
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
}
}
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
return
output
;
}
...
...
@@ -346,10 +372,12 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
int64_t
algo
)
{
auto
kernelVolume
=
indiceNum
.
size
(
0
);
switch
(
algo
)
{
case
kBatchGemm
:
{
case
kBatchGemmGather
:
case
kBatch
:
{
if
(
kernelVolume
!=
1
)
{
return
indiceConvBackwardBatch
(
features
,
filters
,
outGrad
,
indicePairs
,
indiceNum
,
_inverse
,
_subM
);
indiceNum
,
_inverse
,
_subM
,
algo
!=
kBatchGemmGather
);
}
else
{
break
;
}
...
...
@@ -439,7 +467,7 @@ std::vector<torch::Tensor>
indiceConvBackwardBatch
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
_inverse
,
int64_t
_subM
)
{
int64_t
_subM
,
bool
batchScatter
)
{
bool
subM
=
_subM
!=
0
;
bool
inverse
=
_inverse
!=
0
;
...
...
@@ -467,101 +495,99 @@ indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
torch
::
Tensor
inputGrad
=
torch
::
zeros
(
features
.
sizes
(),
options
);
torch
::
Tensor
filtersGrad
=
torch
::
zeros
(
filterShape
,
options
);
int
bufferSize
=
subM
?
indicePairTop2Size
:
indicePairMaxSize
;
torch
::
Tensor
inputBuffer
=
torch
::
zeros
({
kernelVolume
,
bufferSize
,
numInPlanes
},
options
);
torch
::
Tensor
outputBuffer
=
torch
::
zeros
({
kernelVolume
,
bufferSize
,
numOutPlanes
},
options
);
filters
=
filters
.
view
({
-
1
,
numInPlanes
,
numOutPlanes
});
filtersGrad
=
filtersGrad
.
view
({
-
1
,
numInPlanes
,
numOutPlanes
});
int64_t
size
=
kernelVolume
*
bufferSize
;
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairs
[
inverse
],
size
);
batch_sparse_gather_cuda
(
outputBuffer
,
outGrad
,
indicePairs
[
!
inverse
],
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
// filters: KV, I, O, inputBuffer: [KV, buffer, I]
// outputBuffer: [KV, buffer, O]
torch
::
bmm_out
(
filtersGrad
,
inputBuffer
.
permute
({
0
,
2
,
1
}),
outputBuffer
);
torch
::
bmm_out
(
inputBuffer
,
outputBuffer
,
filters
.
permute
({
0
,
2
,
1
}));
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_scatter_add_cuda
(
inputBuffer
,
inputGrad
,
indicePairs
[
inverse
],
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
std
::
vector
<
std
::
pair
<
int
,
int
>>
part_ranges
=
{{
0
,
kernelVolume
}};
int
maxKernelVolumePart
=
kernelVolume
;
if
(
subM
)
{
auto
remain_size
=
indicePairMaxSize
-
indicePairTop2Size
;
if
(
remain_size
<=
0
)
{
maxKernelVolumePart
=
std
::
max
(
indicePairMaxOffset
,
int
(
kernelVolume
-
indicePairMaxOffset
-
1
));
part_ranges
=
{{
0
,
indicePairMaxOffset
},
{
indicePairMaxOffset
+
1
,
kernelVolume
}};
auto
filtersGradSub
=
filtersGrad
[
indicePairMaxOffset
];
auto
filtersSub
=
filters
[
indicePairMaxOffset
];
torch
::
mm_out
(
filtersGradSub
,
features
.
t
(),
outGrad
);
torch
::
mm_out
(
inputGrad
,
outGrad
,
filtersSub
.
t
());
if
(
indicePairTop2Size
==
0
)
{
return
{
inputGrad
,
filtersGrad
.
view
(
filterShape
)};
}
inputBuffer
=
torch
::
zeros
({
remain_size
,
numInPlanes
},
options
);
outputBuffer
=
torch
::
zeros
({
remain_size
,
numOutPlanes
},
options
);
}
torch
::
Tensor
inputBuffer
=
torch
::
zeros
({
maxKernelVolumePart
,
bufferSize
,
numInPlanes
},
options
);
torch
::
Tensor
outputBuffer
=
torch
::
zeros
({
maxKernelVolumePart
,
bufferSize
,
numOutPlanes
},
options
);
for
(
auto
&
range
:
part_ranges
)
{
int
start
=
range
.
first
;
int
end
=
range
.
second
;
int
length
=
end
-
start
;
int64_t
size
=
length
*
bufferSize
;
auto
inputBufferPart
=
tv
::
torch_slice_first_axis
(
inputBuffer
,
0
,
length
);
auto
outputBufferPart
=
tv
::
torch_slice_first_axis
(
outputBuffer
,
0
,
length
);
auto
indicePairs1Part
=
tv
::
torch_slice_first_axis
(
indicePairs
[
inverse
],
start
,
end
);
auto
indicePairs2Part
=
tv
::
torch_slice_first_axis
(
indicePairs
[
!
inverse
],
start
,
end
);
auto
filtersPart
=
tv
::
torch_slice_first_axis
(
filters
,
start
,
end
);
auto
filtersGradPart
=
tv
::
torch_slice_first_axis
(
filtersGrad
,
start
,
end
);
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
indice_dtype
,
[
&
](
auto
I
)
{
using
Index
=
decltype
(
I
);
auto
indicePairsRemain
=
torch
::
from_blob
(
indicePairs
[
inverse
][
indicePairMaxOffset
].
data_ptr
<
Index
>
()
+
indicePairTop2Size
,
{
remain_size
},
indicePairs
.
options
());
auto
indicePairsRemain2
=
torch
::
from_blob
(
indicePairs
[
!
inverse
][
indicePairMaxOffset
].
data_ptr
<
Index
>
()
+
indicePairTop2Size
,
{
remain_size
},
indicePairs
.
options
());
batch_sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairsRemain
,
remain_size
);
batch_sparse_gather_cuda
(
outputBuffer
,
outGrad
,
indicePairsRemain2
,
remain_size
);
});
batch_sparse_gather_cuda
(
inputBufferPart
,
features
,
indicePairs1Part
,
size
);
batch_sparse_gather_cuda
(
outputBufferPart
,
outGrad
,
indicePairs2Part
,
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
torch
::
mm_out
(
filtersGrad
,
inputBuffer
.
t
(),
outputBuffer
);
torch
::
mm_out
(
inputBuffer
,
outputBuffer
,
filters
[
indicePairMaxOffset
].
t
());
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
// filters: KV, I, O, inputBuffer: [KV, buffer, I]
// outputBuffer: [KV, buffer, O]
torch
::
bmm_out
(
filtersGradPart
,
inputBufferPart
.
permute
({
0
,
2
,
1
}),
outputBufferPart
);
torch
::
bmm_out
(
inputBuffer
,
outputBufferPart
,
filtersPart
.
permute
({
0
,
2
,
1
}));
if
(
batchScatter
)
{
if
(
device
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
tv
::
dispatch_torch
<
int32_t
,
int64_t
>
(
indice_dtype
,
[
&
](
auto
I
)
{
using
Index
=
decltype
(
I
);
auto
indicePairsRemain2
=
torch
::
from_blob
(
indicePairs
[
!
inverse
][
indicePairMaxOffset
].
data_ptr
<
Index
>
()
+
indicePairTop2Size
,
{
remain_size
},
indicePairs
.
options
());
batch_sparse_scatter_add_cuda
(
inputBuffer
,
inputGrad
,
indicePairsRemain2
,
remain_size
);
});
}
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_scatter_add_cuda
(
inputBufferPart
,
inputGrad
,
indicePairs1Part
,
size
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
}
else
{
for
(
int
i
=
0
;
i
<
length
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
+
start
];
if
(
nHot
<=
0
)
{
continue
;
}
if
(
device
==
torch
::
kCPU
)
{
sparse_scatter_add_cpu
(
inputBufferPart
[
i
],
inputGrad
,
indicePairs1Part
[
i
],
nHot
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
sparse_scatter_add_cuda
(
inputBufferPart
[
i
],
inputGrad
,
indicePairs1Part
[
i
],
nHot
);
}
#endif
else
{
TV_THROW_INVALID_ARG
(
"unknown device type"
);
}
}
}
}
return
{
inputGrad
,
filtersGrad
.
view
(
filterShape
)};
}
...
...
test/test_conv.py
View file @
8cbb7d3c
...
...
@@ -27,11 +27,18 @@ from spconv.test_utils import TestCase, generate_sparse_data, params_grid
class
SparseConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
spconv
.
ConvAlgo
.
BatchGemmGather
):
super
().
__init__
()
algo
=
spconv
.
ConvAlgo
.
BatchGemm
layers
=
[
spconv
.
SparseConv3d
(
in_channels
,
out_channels
,
...
...
@@ -67,8 +74,17 @@ class SparseConv3dTestTorch(nn.Module):
class
SubMConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
spconv
.
ConvAlgo
.
Native
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
,
algo
=
spconv
.
ConvAlgo
.
Native
):
super
().
__init__
()
layers
=
[
spconv
.
SubMConv3d
(
in_channels
,
...
...
@@ -89,14 +105,14 @@ class SubMConv3dTestTorch(nn.Module):
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
,
algo
=
algo
))
algo
=
algo
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self
.
grid
=
None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
# .cpu()
coors
=
coors
.
int
()
# .cpu()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
...
...
@@ -599,13 +615,13 @@ class TestSpConv(TestCase):
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
def
main
():
def
main
(
algo
=
spconv
.
ConvAlgo
.
Native
):
# function for develop.
np
.
random
.
seed
(
484
)
# devices = ["cuda:0"]
devices
=
[
"cuda:0"
]
shapes
=
[[
5
0
,
3
0
,
30
]]
batchsizes
=
[
2
]
shapes
=
[[
40
0
,
40
0
,
15
]]
batchsizes
=
[
1
]
in_channels
=
[
32
]
out_channels
=
[
64
]
...
...
@@ -620,7 +636,7 @@ def main():
if
all
([
s
>
1
,
d
>
1
]):
continue
device
=
torch
.
device
(
dev
)
num_points
=
[
5
00
]
*
bs
num_points
=
[
300
00
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
...
...
@@ -636,8 +652,8 @@ def main():
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
float
()
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
float
()
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
algo
).
to
(
device
).
float
()
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
float
()
...
...
@@ -662,7 +678,8 @@ def main():
print
(
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
out_ref
.
detach
().
cpu
().
numpy
()))
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
out_numpy
.
sum
())
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
out_numpy
.
sum
())
def
main_subm
(
algo
):
...
...
@@ -671,7 +688,7 @@ def main_subm(algo):
torch
.
manual_seed
(
50051
)
# devices = ["cuda:0"]
devices
=
[
"cuda:0"
]
shapes
=
[[
5
0
,
3
0
,
30
]]
shapes
=
[[
40
0
,
40
0
,
15
]]
batchsizes
=
[
2
]
in_channels
=
[
32
]
...
...
@@ -686,7 +703,7 @@ def main_subm(algo):
if
all
([
s
>
1
,
d
>
1
]):
continue
device
=
torch
.
device
(
dev
)
num_points
=
[
1
000
]
*
bs
num_points
=
[
240
000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
...
...
@@ -702,8 +719,8 @@ def main_subm(algo):
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
float
()
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
float
()
net
=
SubMConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
algo
).
to
(
device
).
float
()
net
=
SubMConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
,
algo
=
algo
).
to
(
device
).
float
()
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
float
()
...
...
@@ -712,7 +729,7 @@ def main_subm(algo):
net
.
net
[
0
].
weight
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
times
=
[]
for
i
in
range
(
10
0
):
for
i
in
range
(
2
0
):
t
=
time
.
time
()
out
=
net
(
features_t
,
indices_t
,
bs
)
torch
.
cuda
.
synchronize
()
...
...
@@ -727,11 +744,13 @@ def main_subm(algo):
print
(
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
out_ref
.
detach
().
cpu
().
numpy
()))
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
out_numpy
.
sum
())
print
(
out_numpy
.
min
(),
out_numpy
.
max
(),
out_numpy
.
mean
(),
out_numpy
.
sum
())
return
out_numpy
if
__name__
==
'__main__'
:
#
out_my =
main_subm(algo=spconv.ConvAlgo.BatchGemm)
# main_subm(algo=spconv.ConvAlgo.BatchGemm
Gather
)
# out_ref = main_subm(algo=spconv.ConvAlgo.Native)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment