Fix #45: relax the kernel size requirement (maximum kernel volume raised from 256 to 4096)

73427720 · traveller59 · 10db9b67 · 73427720 · 73427720 · 73427720
Commit 73427720 authored May 04, 2019 by traveller59
7 changed files
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -147,8 +147,7 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
  }
 }
-template <typename Index, typename IndexGrid, unsigned NDim,
+template <typename Index, typename IndexGrid, unsigned NDim>
-          int KernelMaxVolume = 256>
 __global__ void
 prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
                  tv::TensorView<IndexGrid> gridsOut,

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -47,7 +47,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 256, "error");
+  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
@@ -159,7 +159,7 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
-  TV_ASSERT_RT_ERR(kernelVolume <= 256, "error");
+  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];

--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
@@ -102,7 +102,7 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
 struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
-  cudaStream_t stream() const { return mStream; }
+  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
 };
 struct CPU {};

--- a/include/torch_utils.h
+++ b/include/torch_utils.h
@@ -21,8 +21,8 @@
 namespace tv {
 struct TorchGPU: public tv::GPU {
-  TorchGPU(){
+  virtual cudaStream_t getStream() const override {
-    mStream = at::cuda::getCurrentCUDAStream();
+    return at::cuda::getCurrentCUDAStream();
  }
 };
@@ -48,7 +48,11 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
    TV_ASSERT_RT_ERR(val, "error");
    break;
  }
+  case at::ScalarType::Long: {
+    auto val = std::is_same<std::remove_const_t<T>, long>::value;
+    TV_ASSERT_RT_ERR(val, "error");
+    break;
+  }
  default:
    TV_ASSERT_RT_ERR(false, "error");
  }

--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -45,15 +45,15 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
      return 0;
    // auto timer = spconv::CudaContextTimer<>();
    if (transpose)
-      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 256>
+      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-             d.stream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
+             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                           indiceNum, indicePairUnique, kernelSize, stride,
                           padding, dilation, outSpatialShape);
    else
-      prepareIndicePairsKernel<Index, IndexGrid, NDim, 256>
+      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-             d.stream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
+             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                           indiceNum, indicePairUnique, kernelSize, stride,
                           padding, dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
@@ -80,18 +80,18 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
-           d.stream()>>>(indicesOut, gridsOut, numAct, indicePairs,
+           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                         indicePairUnique, outSpatialShape, batchSize);
    TV_CHECK_CUDA_ERR();
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-           d.stream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
+           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                         indicePairUnique, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
-             d.stream()>>>(indicePairUnique.data(), gridsOut, numAct);
+             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_CUDA_ERR();
    }
    return numAct;
@@ -116,18 +116,18 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
    // auto timer = spconv::CudaContextTimer<>();
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-           d.stream()>>>(indicesIn, gridsOut, outSpatialShape);
+           d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
    TV_CHECK_CUDA_ERR();
-    getSubMIndicePairsKernel<Index, IndexGrid, NDim>
+    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-           d.stream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
+           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
                         kernelSize, stride, padding, dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-             d.stream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);
+             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);
      TV_CHECK_CUDA_ERR();
    }
    return numActIn;

--- a/src/spconv/maxpool.cu
+++ b/src/spconv/maxpool.cu
@@ -329,7 +329,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
            maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                   d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                 indices.subview(0).data(),
                                 indices.subview(1).data(), numHotBlock,
                                 numPlanes / vecloadFactor);
@@ -339,7 +339,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
          if (size > numHotBlock) {
            maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                   0, d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    indices.subview(0).data() + numHotBlock,
                                    indices.subview(1).data() + numHotBlock,
                                    size - numHotBlock, numPlanes);
@@ -357,7 +357,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
      if (numHotBlock >= NumTLP) {
        maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
@@ -367,7 +367,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
      if (size > numHotBlock) {
        maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
@@ -403,7 +403,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
            maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                   d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                 dout.data(), din.data(),
                                 indices.subview(0).data(),
                                 indices.subview(1).data(), numHotBlock,
@@ -414,7 +414,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
          if (size > numHotBlock) {
            maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                   0, d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    dout.data(), din.data(),
                                    indices.subview(0).data() + numHotBlock,
                                    indices.subview(1).data() + numHotBlock,
@@ -433,7 +433,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
      if (numHotBlock >= NumTLP) {
        maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
@@ -443,7 +443,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
      if (size > numHotBlock) {
        maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
-               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,

--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -50,7 +50,7 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
            gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                   d.stream()>>>(buffer.data(), features.data(), indices.data(),
+                   d.getStream()>>>(buffer.data(), features.data(), indices.data(),
                                 nHotBlock, numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
@@ -59,7 +59,7 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
            gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
-                   d.stream()>>>(buffer.data() + nHotBlock * numPlanes,
+                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                 features.data(), indices.data() + nHotBlock,
                                 size - nHotBlock, numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
@@ -75,7 +75,7 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
      gatherGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
-             dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size, numPlanes);
      TV_CHECK_CUDA_ERR();
    }
@@ -107,7 +107,7 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
                                     vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                   d.stream()>>>(outFeatures.data(), buffer.data(),
+                   d.getStream()>>>(outFeatures.data(), buffer.data(),
                                 indices.data(), nHotBlock,
                                 numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
@@ -115,7 +115,7 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
          if (size - nHotBlock > 0) {
            scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                   0, d.stream()>>>(
+                   0, d.getStream()>>>(
                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
@@ -130,7 +130,7 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
      scatterAddGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
-             dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_CUDA_ERR();