1.2.1: 4x faster subm indice generation

11bcbbf6 · yanyan · 492865a3 · 11bcbbf6 · 11bcbbf6 · 11bcbbf6
Commit 11bcbbf6 authored Jun 04, 2020 by yanyan
Hide whitespace changes
Inline Side-by-side

Showing with 65 additions and 29 deletions

CHANGELOG.md CHANGELOG.md +5 -0

include/spconv/indice.cu.h include/spconv/indice.cu.h +56 -25

setup.py setup.py +1 -1

test/test_conv.py test/test_conv.py +3 -3

No files found.
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog
+## [1.2.1] - 2020-06-04
+### Changed
+- The subm indice pair generation speed is greatly increased by two tricks: 1. most subm conv use only kernelsize=3, so we can unroll loops to get 100% performance increase. 2. subm indice pairs have a property: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1], so we can get another 100% performance increase. 
 ## [1.2.0] - 2020-05-28
 ### Added
 - add batch gemm support. small performance increasement but more gpu memory usage. you can use algo=spconv.ConvAlgo.Batch to use it.

--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -274,10 +274,13 @@ __global__ void getSubMIndicePairsKernel3(
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
  auto numActIn = indicesIn.dim(0);
  Index point[3];
  Index index = 0;
  Index offset;
+  constexpr unsigned KV = K0 * K1 * K2;
+  constexpr unsigned center = KV / 2;
+  *(indiceNum.data() + center) = numActIn;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
 #pragma unroll
@@ -287,19 +290,32 @@ __global__ void getSubMIndicePairsKernel3(
 #pragma unroll
        for (int k = 0; k < K2; ++k) {
          offset = i * K1 * K2 + j * K2 + k;
-          point[2] = indice_data[3] - k + K2 / 2;
+          if (offset > center){
-          point[1] = indice_data[2] - j + K1 / 2;
+            continue;
-          point[0] = indice_data[1] - i + K0 / 2;
+          }
-          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
+          if (center == offset){
-              point[2] < outSpatialShape[2] && point[0] >= 0 &&
+              // center of subm indice pairs dont need atomicadd
-              point[0] < outSpatialShape[0]) {
+              indicePairs(1, offset, ix) = ix;
-            index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
+              indicePairs(0, offset, ix) = ix;
-                        point, outSpatialShape.data(), 0) +
+          }else{
-                    spatialVolume * indice_data[0];
+            point[2] = indice_data[3] - k + K2 / 2;
-            if (gridsOut[index] != -1) {
+            point[1] = indice_data[2] - j + K1 / 2;
-              Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+            point[0] = indice_data[1] - i + K0 / 2;
-              indicePairs(1, offset, oldNum) = gridsOut[index];
+            if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
-              indicePairs(0, offset, oldNum) = ix;
+                point[2] < outSpatialShape[2] && point[0] >= 0 &&
+                point[0] < outSpatialShape[0]) {
+              index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
+                          point, outSpatialShape.data(), 0) +
+                      spatialVolume * indice_data[0];
+              if (gridsOut[index] != -1) {
+                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
+                Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+                atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
+                indicePairs(1, offset, oldNum) = gridsOut[index];
+                indicePairs(0, offset, oldNum) = ix;
+                indicePairs(1, KV - offset - 1, oldNum) = ix;
+                indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
+              }
            }
          }
        }
@@ -317,6 +333,9 @@ __global__ void getSubMIndicePairsKernel2(
  Index point[2];
  Index index = 0;
  Index offset;
+  constexpr unsigned KV = K0 * K1;
+  constexpr unsigned center = KV / 2;
+  *(indiceNum.data() + center) = numActIn;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    const Index *indice_data = indicesIn.data() + ix * (2 + 1);
@@ -325,17 +344,29 @@ __global__ void getSubMIndicePairsKernel2(
 #pragma unroll
      for (int j = 0; j < K1; ++j) {
        offset = i * K1 + j;
-        point[1] = indice_data[2] - j + K1 / 2;
+        if (offset > center){
-        point[0] = indice_data[1] - i + K0 / 2;
+          continue;
-        if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
+        }
-            point[0] < outSpatialShape[0]) {
+        if (center == offset){
-          index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
+            // center of subm indice pairs dont need atomicadd
-                      point, outSpatialShape.data(), 0) +
+            indicePairs(1, offset, ix) = ix;
-                  spatialVolume * indice_data[0];
+            indicePairs(0, offset, ix) = ix;
-          if (gridsOut[index] > -1) {
+        }else{
-            Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+          point[1] = indice_data[2] - j + K1 / 2;
-            indicePairs(1, offset, oldNum) = gridsOut[index];
+          point[0] = indice_data[1] - i + K0 / 2;
-            indicePairs(0, offset, oldNum) = ix;
+          if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
+              point[0] < outSpatialShape[0]) {
+            index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
+                        point, outSpatialShape.data(), 0) +
+                    spatialVolume * indice_data[0];
+            if (gridsOut[index] > -1) {
+              Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+              atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
+              indicePairs(1, offset, oldNum) = gridsOut[index];
+              indicePairs(0, offset, oldNum) = ix;
+              indicePairs(1, KV - offset - 1, oldNum) = ix;
+              indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
+            }
          }
        }
      }

--- a/setup.py
+++ b/setup.py
@@ -96,7 +96,7 @@ class CMakeBuild(build_ext):
 packages = find_packages(exclude=('tools', 'tools.*'))
 setup(
    name='spconv',
-    version='1.2',
+    version='1.2.1',
    author='Yan Yan',
    author_email='scrin@foxmail.com',
    description='spatial sparse convolution for pytorch',

--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
 if __name__ == '__main__':
-    # main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
+    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
-    # main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half)
+    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.half)
    # TestCase().assertAllClose(out_my, out_ref)
    # unittest.main()
-    TestSpConv().testSpConv3d()
+    # TestSpConv().testSpConv3d()