Commit 11bcbbf6 authored by yanyan's avatar yanyan
Browse files

1.2.1: 4x faster subm indice generation

parent 492865a3
# Changelog # Changelog
## [1.2.1] - 2020-06-04
### Changed
- The subm indice pair generation speed is greatly increased by two tricks: 1. most subm conv use only kernelsize=3, so we can unroll loops to get 100% performance increase. 2. subm indice pairs have a property: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1], so we can get another 100% performance increase.
## [1.2.0] - 2020-05-28 ## [1.2.0] - 2020-05-28
### Added ### Added
- add batch gemm support. small performance increasement but more gpu memory usage. you can use algo=spconv.ConvAlgo.Batch to use it. - add batch gemm support. small performance increasement but more gpu memory usage. you can use algo=spconv.ConvAlgo.Batch to use it.
......
...@@ -274,10 +274,13 @@ __global__ void getSubMIndicePairsKernel3( ...@@ -274,10 +274,13 @@ __global__ void getSubMIndicePairsKernel3(
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) { const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0); auto numActIn = indicesIn.dim(0);
Index point[3]; Index point[3];
Index index = 0; Index index = 0;
Index offset; Index offset;
constexpr unsigned KV = K0 * K1 * K2;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) { for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (3 + 1); const Index *indice_data = indicesIn.data() + ix * (3 + 1);
#pragma unroll #pragma unroll
...@@ -287,19 +290,32 @@ __global__ void getSubMIndicePairsKernel3( ...@@ -287,19 +290,32 @@ __global__ void getSubMIndicePairsKernel3(
#pragma unroll #pragma unroll
for (int k = 0; k < K2; ++k) { for (int k = 0; k < K2; ++k) {
offset = i * K1 * K2 + j * K2 + k; offset = i * K1 * K2 + j * K2 + k;
point[2] = indice_data[3] - k + K2 / 2; if (offset > center){
point[1] = indice_data[2] - j + K1 / 2; continue;
point[0] = indice_data[1] - i + K0 / 2; }
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 && if (center == offset){
point[2] < outSpatialShape[2] && point[0] >= 0 && // center of subm indice pairs dont need atomicadd
point[0] < outSpatialShape[0]) { indicePairs(1, offset, ix) = ix;
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs( indicePairs(0, offset, ix) = ix;
point, outSpatialShape.data(), 0) + }else{
spatialVolume * indice_data[0]; point[2] = indice_data[3] - k + K2 / 2;
if (gridsOut[index] != -1) { point[1] = indice_data[2] - j + K1 / 2;
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); point[0] = indice_data[1] - i + K0 / 2;
indicePairs(1, offset, oldNum) = gridsOut[index]; if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
indicePairs(0, offset, oldNum) = ix; point[2] < outSpatialShape[2] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] != -1) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
}
} }
} }
} }
...@@ -317,6 +333,9 @@ __global__ void getSubMIndicePairsKernel2( ...@@ -317,6 +333,9 @@ __global__ void getSubMIndicePairsKernel2(
Index point[2]; Index point[2];
Index index = 0; Index index = 0;
Index offset; Index offset;
constexpr unsigned KV = K0 * K1;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) { for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (2 + 1); const Index *indice_data = indicesIn.data() + ix * (2 + 1);
...@@ -325,17 +344,29 @@ __global__ void getSubMIndicePairsKernel2( ...@@ -325,17 +344,29 @@ __global__ void getSubMIndicePairsKernel2(
#pragma unroll #pragma unroll
for (int j = 0; j < K1; ++j) { for (int j = 0; j < K1; ++j) {
offset = i * K1 + j; offset = i * K1 + j;
point[1] = indice_data[2] - j + K1 / 2; if (offset > center){
point[0] = indice_data[1] - i + K0 / 2; continue;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 && }
point[0] < outSpatialShape[0]) { if (center == offset){
index = tv::ArrayIndexRowMajor<2, 2>::runPtrs( // center of subm indice pairs dont need atomicadd
point, outSpatialShape.data(), 0) + indicePairs(1, offset, ix) = ix;
spatialVolume * indice_data[0]; indicePairs(0, offset, ix) = ix;
if (gridsOut[index] > -1) { }else{
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); point[1] = indice_data[2] - j + K1 / 2;
indicePairs(1, offset, oldNum) = gridsOut[index]; point[0] = indice_data[1] - i + K0 / 2;
indicePairs(0, offset, oldNum) = ix; if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] > -1) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
}
} }
} }
} }
......
...@@ -96,7 +96,7 @@ class CMakeBuild(build_ext): ...@@ -96,7 +96,7 @@ class CMakeBuild(build_ext):
packages = find_packages(exclude=('tools', 'tools.*')) packages = find_packages(exclude=('tools', 'tools.*'))
setup( setup(
name='spconv', name='spconv',
version='1.2', version='1.2.1',
author='Yan Yan', author='Yan Yan',
author_email='scrin@foxmail.com', author_email='scrin@foxmail.com',
description='spatial sparse convolution for pytorch', description='spatial sparse convolution for pytorch',
......
...@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32): ...@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if __name__ == '__main__': if __name__ == '__main__':
# main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32) main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
# main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half) main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half)
# TestCase().assertAllClose(out_my, out_ref) # TestCase().assertAllClose(out_my, out_ref)
# unittest.main() # unittest.main()
TestSpConv().testSpConv3d() # TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment