working on batch indice conv (almost finished)

f08268fc · Yan Yan · bf473de0 · f08268fc · f08268fc · f08268fc
Commit f08268fc authored May 25, 2020 by Yan Yan
8 changed files
--- a/include/spconv/reordering.cu.h
+++ b/include/spconv/reordering.cu.h
@@ -14,10 +14,11 @@
 #ifndef REORDERING_CU_H_
 #define REORDERING_CU_H_
+#include <THC/THCAtomics.cuh>
 #include <tensorview/kernel_utils.h>
 // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
 namespace spconv {
 template <typename T, typename Index, int NumTLP, int NumILP>
 __global__ void gatherGenericKernel(T *buffer, const T *features,
                                    const Index *indices, int size,
@@ -99,12 +100,13 @@ __global__ void gatherVecBlockKernel(T *buffer, const T *features,
 template <typename T, typename Index, int NumTLP, int NumILP>
 __global__ void batchGatherGenericKernel(T *buffer, const T *features,
                                         const Index *indices, int size,
-                                         int numPlanes, int batch_stride,
+                                         int numPlanes, int indice_batch_stride,
                                         int feature_batch_stride) {
+  // size: max indice num * kernel volume
+  // inds: [volume, num_elems]
  int ILPStrideX[NumILP];
  Index inds[NumILP];
-  Index batchIdx[NumILP];
+  Index inds_elem;
 #pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
@@ -113,16 +115,18 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
 #pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size) {
-        batchIdx[ilp] = ix / feature_batch_stride;
+        inds_elem = ix + ILPStrideX[ilp];
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
+        inds[ilp] =
+            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
+                    inds_elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
 #pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size)
+        if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1)
          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-              features[inds[ilp] + iy];
+              features[inds[ilp] * numPlanes + iy];
      }
    }
  }
@@ -131,9 +135,12 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
 template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
 __global__ void batchGatherVecKernel(T *buffer, const T *features,
                                     const Index *indices, int size,
-                                     int numPlanes) {
+                                     int feature_offset,
+                                     int numPlanes, int indice_batch_stride,
+                                     int feature_batch_stride) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
+  Index inds_elem;
 #pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
@@ -141,16 +148,21 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
 #pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size)
+      if (ix + ILPStrideX[ilp] < size) {
-        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
+        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
+        inds[ilp] =
+            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
+                    inds_elem % feature_batch_stride];
+      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
 #pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size)
+        if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1)
          reinterpret_cast<VecType *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
-              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
+              reinterpret_cast<const VecType *>(
+                  features)[inds[ilp] * numPlanes + iy];
      }
    }
  }
@@ -158,24 +170,34 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
 template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
-__global__ void batchGatherVecBlockKernel(T *buffer, const T *features,
+__global__ void
-                                          const Index *indices, int size,
+batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices,
-                                          int numPlanes) {
+                          int size, int numPlanes, int indice_batch_stride,
+                          int feature_batch_stride) {
  int ILPStrideY[NumILP];
+  Index inds;
 #pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
+  Index inds_elem;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
 #pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      reinterpret_cast<VecType *>(
+      inds_elem = iy + ILPStrideY[ilp];
-          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
+      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-          reinterpret_cast<const VecType *>(
+                     inds_elem % feature_batch_stride];
-              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
-                        threadIdx.x];
+      if (inds != -1) {
+        reinterpret_cast<VecType *>(
+            buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
+            reinterpret_cast<const VecType *>(
+                features)[inds * numPlanes + threadIdx.x];
+      }
    }
  }
 }
@@ -240,6 +262,68 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
  }
 }
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
+                                             const Index *indices, int size,
+                                             int feature_offset, int numPlanes,
+                                             int indice_batch_stride,
+                                             int feature_batch_stride) {
+  int ILPStrideX[NumILP];
+  Index inds[NumILP];
+  Index inds_elem;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      if (ix + ILPStrideX[ilp] < size) {
+        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
+        inds[ilp] =
+            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
+                    inds_elem % feature_batch_stride];
+      }
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1) {
+          gpuAtomicAdd(outFeatures + inds[ilp] * numPlanes + iy,
+                       buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]);
+          // outFeatures[inds[ilp] * numPlanes + iy] +=
+          //     buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
+        }
+      }
+    }
+  }
+}
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void
+batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
+                           const Index *indices, int size, int numPlanes,
+                           int indice_batch_stride, int feature_batch_stride) {
+  int ILPStrideY[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
+  outFeatures += blockIdx.x * NumTLP;
+  buffer += blockIdx.x * NumTLP;
+  Index inds, inds_elem;
+  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ++ilp) {
+      inds_elem = iy + ILPStrideY[ilp];
+      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
+                     inds_elem % feature_batch_stride];
+      if (inds != -1) {
+        gpuAtomicAdd(outFeatures + inds * numPlanes + threadIdx.x,
+                     buffer[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x]);
+      }
+    }
+  }
+}
 } // namespace spconv
 #endif
\ No newline at end of file
--- a/include/spconv/reordering.h
+++ b/include/spconv/reordering.h
@@ -19,6 +19,12 @@
 namespace spconv {
+void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
+                                torch::Tensor indices, int size);
+void batch_sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
+                                    torch::Tensor indices, int size);
 void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                        torch::Tensor indices, int size);
 void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -338,6 +338,9 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
 }
+torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
+                         torch::Tensor indicePairs, torch::Tensor indiceNum,
+                         int64_t numActOut, int64_t _inverse, int64_t _subM);
 torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -107,7 +107,7 @@ def indice_conv(features,
                num_activate_out,
                inverse=False,
                subm=False):
-    return torch.ops.spconv.indice_conv(features, filters, indice_pairs,
+    return torch.ops.spconv.indice_conv_batch(features, filters, indice_pairs,
                                        indice_pair_num, num_activate_out,
                                        int(inverse), int(subm))

--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -30,6 +30,7 @@ static auto registry =
        .op("spconv::get_indice_pairs_grid_3d",
            &spconv::getIndicePairPreGrid<3>)
        .op("spconv::indice_conv", &spconv::indiceConv)
+        .op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
        .op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
        .op("spconv::indice_maxpool", &spconv::indiceMaxPool)

--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -161,4 +161,154 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
  });
 }
+void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
+                              torch::Tensor indices, int size) {
+  // indices: [volume, inds_stride]
+  // buffer: [volume, num_points, num_features]
+  // size == volume * num_points
+  if (size <= 0)
+    return;
+  int numPlanes = features.size(1);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  auto dtype = features.scalar_type();
+  auto inds_dtype = indices.scalar_type();
+  int inds_stride = indices.size(1);
+  int feature_stride = buffer.size(1);
+  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
+    using T = decltype(TValue);
+    using vecload_type_t =
+        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
+    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
+    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
+      using Index = decltype(IndexValue);
+      bool notFound = true;
+      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
+      tv::mp_for_each<kernel_block_t>(
+          [=, &buffer, &features, &indices, &notFound](auto NumTLP) {
+            constexpr int NumILP = NumTLP / 4;
+            // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
+            int nHotBlock = (size / NumTLP) * NumTLP;
+            if (notFound) {
+              if (numPlanes % NumTLP == 0) {
+                if (nHotBlock >= NumTLP) {
+                  batchGatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
+                                            vecload_type_t>
+                      <<<dim3(numPlanes / NumTLP, size / NumTLP),
+                         dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
+                         stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
+                                   indices.data_ptr<Index>(), nHotBlock,
+                                   numPlanes / vecloadFactor, inds_stride,
+                                   feature_stride);
+                  TV_CHECK_CUDA_ERR();
+                }
+                if (size - nHotBlock > 0) {
+                auto indices_offset = (nHotBlock / feature_stride) * inds_stride + nHotBlock % feature_stride;
+                  batchGatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+                      <<<dim3(1, numPlanes / NumTLP),
+                         dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
+                         stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
+                                   features.data_ptr<T>(),
+                                   indices.data_ptr<Index>() + indices_offset,
+                                   size - nHotBlock, nHotBlock, numPlanes / vecloadFactor,
+                                   inds_stride, feature_stride);
+                  TV_CHECK_CUDA_ERR();
+                }
+                notFound = false;
+              }
+            }
+          });
+      if (notFound) {
+        constexpr int NumTLP = 64;
+        constexpr int NumILP = NumTLP / 4;
+        batchGatherGenericKernel<T, Index, NumTLP, NumILP>
+            <<<dim3(tv::cuda::DivUp(size, NumTLP),
+                    tv::cuda::DivUp(numPlanes, NumTLP)),
+               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
+                buffer.data_ptr<T>(), features.data_ptr<T>(),
+                indices.data_ptr<Index>(), size, numPlanes, inds_stride,
+                feature_stride);
+        TV_CHECK_CUDA_ERR();
+      }
+    });
+  });
+}
+void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
+                                   torch::Tensor outFeatures,
+                                   torch::Tensor indices, int size) {
+  // indices: [volume, inds_stride]
+  // buffer: [volume, num_points, num_features]
+  // size == volume * num_points
+  if (size <= 0)
+    return;
+  int numPlanes = outFeatures.size(1);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  auto dtype = outFeatures.scalar_type();
+  auto inds_dtype = indices.scalar_type();
+  int inds_stride = indices.size(1);
+  int feature_stride = buffer.size(1);
+  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
+    using T = decltype(TValue);
+    using vecload_type_t =
+        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
+    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
+    tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
+      using Index = decltype(IndexValue);
+      bool notFound = true;
+      constexpr int vecloadFactor = 1; // important for half.
+      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
+                                       &notFound](auto NumTLP) {
+        // constexpr int NumILP = NumTLP / (64 / (NumTLP /
+        // vecloadFactor));
+        constexpr int NumILP = NumTLP / 4;
+        int nHotBlock = (size / NumTLP) * NumTLP;
+        if (notFound) {
+          if (numPlanes % NumTLP == 0) {
+            if (nHotBlock >= NumTLP) {
+              batchScatterAddBlockKernel<T, Index, int(NumTLP), NumILP>
+                  <<<dim3(numPlanes / NumTLP, size / NumTLP),
+                     dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
+                     stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
+                               indices.data_ptr<Index>(), nHotBlock,
+                               numPlanes / vecloadFactor, inds_stride,
+                                  feature_stride);
+              TV_CHECK_CUDA_ERR();
+            }
+            if (size - nHotBlock > 0) {
+              // int indices_offset = (nHotBlock / feature_stride) * inds_stride + nHotBlock % feature_stride;
+              batchScatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
+                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
+                     0, stream>>>(outFeatures.data_ptr<T>(),
+                                  buffer.data_ptr<T>() + nHotBlock * numPlanes,
+                                  indices.data_ptr<Index>(),
+                                  size - nHotBlock, nHotBlock, numPlanes, inds_stride,
+                                  feature_stride);
+              TV_CHECK_CUDA_ERR();
+            }
+            notFound = false;
+          }
+        }
+      });
+      if (notFound) {
+        constexpr int NumTLP = 64;
+        constexpr int NumILP = NumTLP / 4;
+        batchScatterAddGenericKernel<T, Index, NumTLP, NumILP>
+            <<<dim3(tv::cuda::DivUp(size, NumTLP),
+                    tv::cuda::DivUp(numPlanes, NumTLP)),
+               dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
+                outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
+                indices.data_ptr<Index>(), size, 0, numPlanes, inds_stride,
+                feature_stride);
+        TV_CHECK_CUDA_ERR();
+      }
+    });
+  });
+}
 } // namespace spconv
\ No newline at end of file
--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
@@ -211,6 +211,67 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  return output;
 }
+torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
+                              torch::Tensor indicePairs,
+                              torch::Tensor indiceNum, int64_t numActOut,
+                              int64_t _inverse, int64_t _subM) {
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indiceNum.size(0);
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
+  int indicePairMaxSize = *indicePairMaxSizeIter;
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
+  torch::Tensor outputBuffer =
+      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
+  filters = filters.view({kernelVolume, numInPlanes, numOutPlanes});
+  if (subM) { // the center index of subm conv don't need gather and scatter
+    // add.
+    torch::mm_out(output, features, filters[indicePairMaxOffset]);
+  }
+  double totalGatherTime = 0;
+  double totalGEMMTime = 0;
+  double totalSAddTime = 0;
+  auto size = kernelVolume * indicePairMaxSize;
+  if (device == torch::kCPU) {
+    TV_ASSERT_INVALID_ARG(false, "unknown device type");
+  }
+#ifdef TV_CUDA
+  else if (device == torch::kCUDA) {
+    batch_sparse_gather_cuda(inputBuffer, features, indicePairs[inverse], size);
+  }
+#endif
+  else {
+    TV_ASSERT_INVALID_ARG(false, "unknown device type");
+  }
+  torch::bmm_out(outputBuffer, inputBuffer, filters);
+  if (device == torch::kCPU) {
+    TV_ASSERT_INVALID_ARG(false, "unknown device type");
+  }
+#ifdef TV_CUDA
+  else if (device == torch::kCUDA) {
+    batch_sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse],
+                                  size);
+  }
+#endif
+  else {
+    TV_ASSERT_INVALID_ARG(false, "unknown device type");
+  }
+  return output;
+}
 std::vector<torch::Tensor>
 indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,

--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -720,6 +720,6 @@ def main_subm():
 if __name__ == '__main__':
-    main_subm()
+    main()
    # unittest.main()
    # TestSpConv().testSpConv3d()