initial release

3f1316d5 · traveller59 · a347176a · 3f1316d5 · 3f1316d5 · 3f1316d5
Commit 3f1316d5 authored Jan 20, 2019 by traveller59
11 changed files
--- a/src/spconv/maxpool.cu
+++ b/src/spconv/maxpool.cu
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include <chrono>
+#include <limits>
+#include <spconv/maxpool.h>
+#include <spconv/mp_helper.h>
+#include <tensorview/helper_kernel.cu.h>
+#include <tensorview/helper_launch.h>
+#include <tensorview/tensorview.h>
+#include <type_traits>
+
+namespace spconv {
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
+                                      const Index *indicesIn,
+                                      const Index *indicesOut, int numHot,
+                                      int numPlanes) {
+  T in, out;
+  int ILPStrideY[NumILP];
+  Index idxo, idxi;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
+  outFeatures += blockIdx.y * NumTLP;
+  inFeatures += blockIdx.y * NumTLP;
+  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
+       ix += blockDim.x * gridDim.x) {
+    {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+        in = inFeatures[idxi];
+        out = outFeatures[idxo];
+        if (in > out) {
+          outFeatures[idxo] = in;
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void
+maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
+                             const Index *indicesIn, const Index *indicesOut,
+                             int numHot, int numPlanes) {
+  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
+  int ILPStrideX[NumILP];
+  Index RI[NumILP];
+  Index RO[NumILP];
+  T in, out;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
+      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        in = inFeatures[RI[ilp] + iy];
+        out = outFeatures[RO[ilp] + iy];
+        if (in > out) {
+          outFeatures[RO[ilp] + iy] = in;
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
+__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
+                                         const Index *indicesIn,
+                                         const Index *indicesOut, int numHot,
+                                         int numPlanes) {
+  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
+  int ILPStrideY[NumILP];
+  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
+  T bufi[vecloadFactor];
+  T bufo[vecloadFactor];
+  Index idxi, idxo;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
+  outFeatures += blockIdx.y * NumTLP;
+  inFeatures += blockIdx.y * NumTLP;
+  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
+       ix += blockDim.x * gridDim.x * vecloadFactor) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ++ilp) {
+      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+      reinterpret_cast<VecType *>(bufo)[0] =
+          reinterpret_cast<VecType *>(outFeatures)[idxo];
+      reinterpret_cast<VecType *>(bufi)[0] =
+          reinterpret_cast<const VecType *>(inFeatures)[idxi];
+#pragma unroll
+      for (int i = 0; i < vecloadFactor; i++) {
+        if (bufi[i] > bufo[i]) {
+          bufo[i] = bufi[i];
+        }
+      }
+      reinterpret_cast<VecType *>(outFeatures)[idxo] =
+          reinterpret_cast<VecType *>(bufo)[0];
+    }
+  }
+}
+
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
+                                        const Index *indicesIn,
+                                        const Index *indicesOut, int numHot,
+                                        int numPlanes) {
+  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
+  int ILPStrideX[NumILP];
+  Index RI[NumILP];
+  Index RO[NumILP];
+  T in, out;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      if (ix + ILPStrideX[ilp] < numHot) {
+        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
+        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
+      }
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        if (ix + ILPStrideX[ilp] < numHot) {
+          in = inFeatures[RI[ilp] + iy];
+          out = outFeatures[RO[ilp] + iy];
+          if (in > out) {
+            outFeatures[RO[ilp] + iy] = in;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void
+maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
+                      T *din, const Index *indicesIn, const Index *indicesOut,
+                      int numHot, int numPlanes) {
+  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
+  T in, out;
+  Index idxo, idxi;
+  int ILPStrideY[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
+  outFeatures += blockIdx.y * NumTLP;
+  inFeatures += blockIdx.y * NumTLP;
+  dout += blockIdx.y * NumTLP;
+  din += blockIdx.y * NumTLP;
+  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
+       ix += blockDim.x * gridDim.x) {
+    {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+        in = inFeatures[idxi];
+        out = outFeatures[idxo];
+        if (in == out) {
+          din[idxi] += dout[idxo];
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
+                                             const T *inFeatures, const T *dout,
+                                             T *din, const Index *indicesIn,
+                                             const Index *indicesOut,
+                                             int numHot, int numPlanes) {
+  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
+  int ILPStrideX[NumILP];
+  Index RI[NumILP];
+  Index RO[NumILP];
+  T in, out;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
+      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        in = inFeatures[RI[ilp] + iy];
+        out = outFeatures[RO[ilp] + iy];
+        if (in == out) {
+          din[RI[ilp] + iy] += dout[RO[ilp] + iy];
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
+__global__ void
+maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
+                         const T *dout, T *din, const Index *indicesIn,
+                         const Index *indicesOut, int numHot, int numPlanes) {
+  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
+  int ILPStrideY[NumILP];
+  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
+  T bufi[vecloadFactor];
+  T bufo[vecloadFactor];
+  T bufdi[vecloadFactor];
+  T bufdo[vecloadFactor];
+  Index idxi, idxo;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
+  outFeatures += blockIdx.y * NumTLP;
+  inFeatures += blockIdx.y * NumTLP;
+  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
+       ix += blockDim.x * gridDim.x * vecloadFactor) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ++ilp) {
+      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+      reinterpret_cast<VecType *>(bufo)[0] =
+          reinterpret_cast<const VecType *>(outFeatures)[idxo];
+      reinterpret_cast<VecType *>(bufi)[0] =
+          reinterpret_cast<const VecType *>(inFeatures)[idxi];
+      reinterpret_cast<VecType *>(bufdo)[0] =
+          reinterpret_cast<const VecType *>(dout)[idxo];
+      reinterpret_cast<VecType *>(bufdi)[0] = reinterpret_cast<VecType *>(din)[idxi];
+
+#pragma unroll
+      for (int i = 0; i < vecloadFactor; i++) {
+        if (bufi[i] == bufo[i]) {
+          bufdi[i] += bufdo[i];
+        }
+      }
+      reinterpret_cast<VecType *>(din)[idxi] = reinterpret_cast<VecType *>(bufdi)[0];
+    }
+  }
+}
+
+template <typename T, typename Index, int NumTLP, int NumILP>
+__global__ void
+maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
+                        const T *dout, T *din, const Index *indicesIn,
+                        const Index *indicesOut, int numHot, int numPlanes) {
+  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
+  int ILPStrideX[NumILP];
+  Index RI[NumILP];
+  Index RO[NumILP];
+  T in, out;
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      if (ix + ILPStrideX[ilp] < numHot) {
+        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
+        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
+      }
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        if (ix + ILPStrideX[ilp] < numHot) {
+          in = inFeatures[RI[ilp] + iy];
+          out = outFeatures[RO[ilp] + iy];
+          if (in == out) {
+            din[RI[ilp] + iy] += dout[RO[ilp] + iy];
+          }
+        }
+      }
+    }
+  }
+}
+
+namespace functor {
+template <typename T, typename Index>
+struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
+  using vecload_type_t =
+      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
+  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const Index> indices, int size) {
+    if (size <= 0)
+      return;
+    int numPlanes = inFeatures.dim(1);
+    bool notFound = true;
+    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
+    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
+                                 &notFound](auto NumTLP) {
+      constexpr int NumILP = NumTLP / 4;
+
+      int numHotBlock = (size / NumTLP) * NumTLP;
+      if (notFound) {
+        if (numPlanes % NumTLP == 0) {
+          if (numHotBlock >= NumTLP) {
+            maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
+                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
+                   d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                                 indices.subview(0).data(),
+                                 indices.subview(1).data(), numHotBlock,
+                                 numPlanes / vecloadFactor);
+            TV_CHECK_CUDA_ERR();
+          }
+
+          if (size > numHotBlock) {
+            maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
+                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
+                   0, d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                                    indices.subview(0).data() + numHotBlock,
+                                    indices.subview(1).data() + numHotBlock,
+                                    size - numHotBlock, numPlanes);
+          }
+          notFound = false;
+        }
+      }
+    });
+
+    if (notFound) {
+      constexpr int NumTLP = 64;
+      constexpr int NumILP = NumTLP / 4;
+      int numHotBlock = (size / NumTLP) * NumTLP;
+      if (numHotBlock >= NumTLP) {
+        maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
+            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
+               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+                outFeatures.data(), inFeatures.data(),
+                indices.subview(0).data(), indices.subview(1).data(),
+                numHotBlock, numPlanes);
+      }
+
+      if (size > numHotBlock) {
+        maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
+            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
+               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+                outFeatures.data(), inFeatures.data(),
+                indices.subview(0).data() + numHotBlock,
+                indices.subview(1).data() + numHotBlock, size - numHotBlock,
+                numPlanes);
+      }
+    }
+  }
+};
+
+template <typename T, typename Index>
+struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
+  using vecload_type_t =
+      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
+  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const T> dout, tv::TensorView<T> din,
+                  tv::TensorView<const Index> indices, int size) {
+    if (size <= 0)
+      return;
+    int numPlanes = inFeatures.dim(1);
+    bool notFound = true;
+    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
+    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
+                                 &indices, &notFound](auto NumTLP) {
+      constexpr int NumILP = NumTLP / 4;
+
+      int numHotBlock = (size / NumTLP) * NumTLP;
+      if (notFound) {
+        if (numPlanes % NumTLP == 0) {
+          if (numHotBlock >= NumTLP) {
+            maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
+                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
+                   d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                                 dout.data(), din.data(),
+                                 indices.subview(0).data(),
+                                 indices.subview(1).data(), numHotBlock,
+                                 numPlanes / vecloadFactor);
+            TV_CHECK_CUDA_ERR();
+          }
+
+          if (size > numHotBlock) {
+            maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
+                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
+                   0, d.stream()>>>(outFeatures.data(), inFeatures.data(),
+                                    dout.data(), din.data(),
+                                    indices.subview(0).data() + numHotBlock,
+                                    indices.subview(1).data() + numHotBlock,
+                                    size - numHotBlock, numPlanes);
+          }
+          notFound = false;
+        }
+      }
+    });
+
+    if (notFound) {
+      constexpr int NumTLP = 64;
+      constexpr int NumILP = NumTLP / 4;
+      int numHotBlock = (size / NumTLP) * NumTLP;
+      if (numHotBlock >= NumTLP) {
+        maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
+            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
+               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+                outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
+                indices.subview(0).data(), indices.subview(1).data(),
+                numHotBlock, numPlanes);
+      }
+
+      if (size > numHotBlock) {
+        maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
+            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
+               dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+                outFeatures.data(), inFeatures.data(), dout.data(), din.data(),
+                indices.subview(0).data() + numHotBlock,
+                indices.subview(1).data() + numHotBlock, size - numHotBlock,
+                numPlanes);
+      }
+    }
+  }
+};
+
+} // namespace functor
+
+#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                    \
+  template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>;     \
+  template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
+
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
+
+DECLARE_GPU_SPECS(float);
+DECLARE_GPU_SPECS(double);
+DECLARE_GPU_SPECS(at::Half);
+
+#undef DECLARE_GPU_SPECS
+#undef DECLARE_GPU_SPECS_T_INDEX
+} // namespace spconv
\ No newline at end of file
--- a/src/spconv/reordering.cc
+++ b/src/spconv/reordering.cc
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <spconv/reordering.h>
+#include <torch/script.h>
+
+namespace spconv {
+namespace functor {
+template <typename T, typename Index>
+struct SparseGatherFunctor<tv::CPU, T, Index> {
+  void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features,
+                  tv::TensorView<const Index> indices, int size) {
+    int numPlanes = features.dim(1);
+    for (int i = 0; i < size; ++i) {
+      std::memcpy(buffer.data() + i * numPlanes,
+                  features.data() + indices[i] * numPlanes,
+                  sizeof(T) * numPlanes);
+    }
+  }
+};
+
+template <typename T, typename Index>
+struct SparseScatterAddFunctor<tv::CPU, T, Index> {
+  void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> buffer, tv::TensorView<const Index> indices,
+                  int size, bool stable) {
+    int numPlanes = outFeatures.dim(1);
+    const T* buf = buffer.data();
+    T* out = outFeatures.data();
+    for (int i = 0; i < size; ++i) {
+      buf = buffer.data() + i * numPlanes;
+      out = outFeatures.data() + indices[i] * numPlanes;
+      for (int j = 0; j < numPlanes; ++j){
+        out[j] += buf[j];
+      }
+    }
+  }
+};
+
+} // namespace functor
+
+
+#define DECLARE_CPU_SPECS_T_INDEX(T, Index)               \
+  template struct functor::SparseGatherFunctor<tv::CPU, T, Index>;  \
+  template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
+
+#define DECLARE_CPU_SPECS(T)                                                   \
+  DECLARE_CPU_SPECS_T_INDEX(T, int);                                           \
+  DECLARE_CPU_SPECS_T_INDEX(T, long);
+
+DECLARE_CPU_SPECS(float);
+DECLARE_CPU_SPECS(double);
+DECLARE_CPU_SPECS(at::Half);
+
+#undef DECLARE_CPU_SPECS
+#undef DECLARE_CPU_SPECS_T_INDEX
+
+} // namespace spconv
+
--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include <chrono>
+#include <limits>
+#include <spconv/mp_helper.h>
+#include <spconv/reordering.h>
+#include <spconv/reordering.cu.h>
+#include <tensorview/helper_kernel.cu.h>
+#include <tensorview/helper_launch.h>
+#include <tensorview/tensorview.h>
+#include <type_traits>
+#include <utility/timer.h>
+
+namespace spconv {
+namespace functor {
+template <typename T, typename Index>
+struct SparseGatherFunctor<tv::GPU, T, Index> {
+  using vecload_type_t =
+      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
+  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
+                  tv::TensorView<const T> features,
+                  tv::TensorView<const Index> indices, int size) {
+    if (size <= 0)
+      return;
+    int numPlanes = features.dim(1);
+    bool notFound = true;
+    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
+    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
+                                 &notFound](auto NumTLP) {
+      constexpr int NumILP = NumTLP / 4;
+      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
+      int nHotBlock = (size / NumTLP) * NumTLP;
+      if (notFound) {
+        if (numPlanes % NumTLP == 0) {
+          if (nHotBlock >= NumTLP) {
+            gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+                <<<dim3(numPlanes / NumTLP, size / NumTLP),
+                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
+                   d.stream()>>>(buffer.data(), features.data(), indices.data(),
+                                 nHotBlock, numPlanes / vecloadFactor);
+
+            TV_CHECK_CUDA_ERR();
+          }
+          if (size - nHotBlock > 0) {
+            gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
+                <<<dim3(1, numPlanes / NumTLP),
+                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
+                   d.stream()>>>(buffer.data() + nHotBlock * numPlanes,
+                                 features.data(), indices.data() + nHotBlock,
+                                 size - nHotBlock, numPlanes / vecloadFactor);
+          }
+          notFound = false;
+        }
+      }
+    });
+
+    if (notFound) {
+      constexpr int NumTLP = 64;
+      constexpr int NumILP = NumTLP / 4;
+      gatherGenericKernel<T, Index, NumTLP, NumILP>
+          <<<dim3(tv::launch::DivUp(size, NumTLP),
+                  tv::launch::DivUp(numPlanes, NumTLP)),
+             dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+              buffer.data(), features.data(), indices.data(), size, numPlanes);
+    }
+  }
+};
+template <typename T, typename Index>
+struct SparseScatterAddFunctor<tv::GPU, T, Index> {
+  using vecload_type_t =
+      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
+  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
+  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> buffer,
+                  tv::TensorView<const Index> indices, int size, bool stable) {
+    if (size <= 0)
+      return;
+    int numPlanes = outFeatures.dim(1);
+    bool notFound = true;
+    constexpr int vecloadFactor =
+        sizeof(vecload_type_t) / sizeof(T); // important for half.
+    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
+                                 &notFound](auto NumTLP) {
+      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
+      constexpr int NumILP = NumTLP / 4;
+      int nHotBlock = (size / NumTLP) * NumTLP;
+      if (notFound) {
+        if (numPlanes % NumTLP == 0) {
+          if (nHotBlock >= NumTLP) {
+            scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
+                                     vecload_type_t>
+                <<<dim3(numPlanes / NumTLP, size / NumTLP),
+                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
+                   d.stream()>>>(outFeatures.data(), buffer.data(),
+                                 indices.data(), nHotBlock,
+                                 numPlanes / vecloadFactor);
+          }
+          if (size - nHotBlock > 0) {
+            scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
+                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
+                   0, d.stream()>>>(
+                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
+                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
+          }
+          notFound = false;
+        }
+      }
+    });
+    if (notFound) {
+      constexpr int NumTLP = 64;
+      constexpr int NumILP = NumTLP / 4;
+      scatterAddGenericKernel<T, Index, NumTLP, NumILP>
+          <<<dim3(tv::launch::DivUp(size, NumTLP),
+                  tv::launch::DivUp(numPlanes, NumTLP)),
+             dim3(NumTLP / NumILP, NumTLP), 0, d.stream()>>>(
+              outFeatures.data(), buffer.data(), indices.data(), size,
+              numPlanes);
+    }
+  }
+};
+} // namespace functor
+
+
+#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                    \
+  template struct functor::SparseGatherFunctor<tv::GPU, T, Index>;             \
+  template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
+
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
+
+DECLARE_GPU_SPECS(float);
+DECLARE_GPU_SPECS(double);
+DECLARE_GPU_SPECS(at::Half);
+
+#undef DECLARE_GPU_SPECS
+#undef DECLARE_GPU_SPECS_T_INDEX
+} // namespace spconv
\ No newline at end of file
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
+add_library(spconv_nms SHARED nms.cu)
+set_target_properties(spconv_nms PROPERTIES VERSION ${PROJECT_VERSION})
+set_target_properties(spconv_nms PROPERTIES SOVERSION 1)
+target_include_directories(spconv_nms PRIVATE ${ALL_INCLUDE})
+set_property(TARGET spconv_nms PROPERTY CXX_STANDARD 14)
+set_property(TARGET spconv_nms PROPERTY CUDA_STANDARD 14)
+target_link_libraries(spconv_nms ${CUDA_CUDART})
+
+install (TARGETS spconv_nms DESTINATION lib)
+
+add_library(spconv_utils SHARED all.cc)
+set_target_properties(spconv_utils PROPERTIES VERSION ${PROJECT_VERSION})
+set_target_properties(spconv_utils PROPERTIES SOVERSION 1)
+target_include_directories(spconv_utils PRIVATE ${ALL_INCLUDE}
+                    ${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
+set_property(TARGET spconv_utils PROPERTY CXX_STANDARD 14)
+set_property(TARGET spconv_utils PROPERTY CUDA_STANDARD 14)
+set_target_properties(spconv_utils PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
+                                         SUFFIX "${PYTHON_MODULE_EXTENSION}")
+target_link_libraries(spconv_utils ${CUDA_CUDART} pybind11::module -Wl,--no-as-needed spconv_nms)
+
+install (TARGETS spconv_utils DESTINATION lib)
--- a/src/utils/all.cc
+++ b/src/utils/all.cc
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <spconv/nms.h>
+#include <spconv/point2voxel.h>
+#include <spconv/box_iou.h>
+namespace py = pybind11;
+using namespace pybind11::literals;
+
+PYBIND11_MODULE(spconv_utils, m)
+{
+    m.doc() = "util pybind11 functions for spconv";
+    m.def("non_max_suppression", &spconv::non_max_suppression<double>, py::return_value_policy::reference_internal, "bbox iou",
+          "boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
+    m.def("non_max_suppression", &spconv::non_max_suppression<float>, py::return_value_policy::reference_internal, "bbox iou",
+          "boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
+    m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
+          "boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
+    m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
+          "boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
+    m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
+          "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
+    m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
+          "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
+    m.def("rbbox_iou", &spconv::rbbox_iou<double>,
+          py::return_value_policy::reference_internal, "rbbox iou",
+          "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
+          "standup_thresh"_a = 4);
+    m.def("rbbox_iou", &spconv::rbbox_iou<float>,
+          py::return_value_policy::reference_internal, "rbbox iou",
+          "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
+          "standup_thresh"_a = 4);
+    m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
+          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
+          "num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
+          "voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
+          "max_voxels"_a = 9);
+    m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
+          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
+          "num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
+          "voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
+          "max_voxels"_a = 9);
+}
\ No newline at end of file
--- a/src/utils/nms.cu
+++ b/src/utils/nms.cu
+// ------------------------------------------------------------------
+// Deformable Convolutional Networks
+// Copyright (c) 2015 Microsoft
+// Licensed under The MIT License
+// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
+// ------------------------------------------------------------------
+#include <vector>
+#include <iostream>
+#include <cuda_runtime.h>
+#include <spconv/nms_gpu.h>
+
+#define CUDA_CHECK(condition)                               \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do                                                        \
+  {                                                         \
+    cudaError_t error = condition;                          \
+    if (error != cudaSuccess)                               \
+    {                                                       \
+      std::cout << cudaGetErrorString(error) << std::endl;  \
+    }                                                       \
+  } while (0)
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+template <typename DType>
+__device__ inline DType devIoU(DType const *const a, DType const *const b)
+{
+  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
+  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  DType interS = width * height;
+  DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return interS / (Sa + Sb - interS);
+}
+
+template <typename DType, int BLOCK_THREADS>
+__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
+                           const DType *dev_boxes, unsigned long long *dev_mask)
+{
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size =
+      min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
+  const int col_size =
+      min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
+
+  __shared__ DType block_boxes[BLOCK_THREADS * 5];
+  if (threadIdx.x < col_size)
+  {
+#pragma unroll
+    for (int i = 0; i < 5; ++i)
+    {
+      block_boxes[threadIdx.x * 5 + i] =
+          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
+    }
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size)
+  {
+    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
+    const DType *cur_box = dev_boxes + cur_box_idx * 5;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start)
+    {
+      start = threadIdx.x + 1;
+    }
+    for (int i = start; i < col_size; i++)
+    {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh)
+      {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+void _set_device(int device_id)
+{
+  int current_device;
+  CUDA_CHECK(cudaGetDevice(&current_device));
+  if (current_device == device_id)
+  {
+    return;
+  }
+  // The call to cudaSetDevice must come before any calls to Get, which
+  // may perform initialization using the GPU.
+  CUDA_CHECK(cudaSetDevice(device_id));
+}
+
+template <typename DType, int BLOCK_THREADS>
+int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
+          int boxes_dim, DType nms_overlap_thresh, int device_id)
+{
+  _set_device(device_id);
+
+  DType *boxes_dev = NULL;
+  unsigned long long *mask_dev = NULL;
+
+  const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
+
+  CUDA_CHECK(cudaMalloc(&boxes_dev,
+                        boxes_num * boxes_dim * sizeof(DType)));
+  CUDA_CHECK(cudaMemcpy(boxes_dev,
+                        boxes_host,
+                        boxes_num * boxes_dim * sizeof(DType),
+                        cudaMemcpyHostToDevice));
+
+  CUDA_CHECK(cudaMalloc(&mask_dev,
+                        boxes_num * col_blocks * sizeof(unsigned long long)));
+
+  dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS),
+              DIVUP(boxes_num, BLOCK_THREADS));
+  dim3 threads(BLOCK_THREADS);
+  nms_kernel<DType, BLOCK_THREADS><<<blocks, threads>>>(boxes_num,
+                                  nms_overlap_thresh,
+                                  boxes_dev,
+                                  mask_dev);
+
+  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
+  CUDA_CHECK(cudaMemcpy(&mask_host[0],
+                        mask_dev,
+                        sizeof(unsigned long long) * boxes_num * col_blocks,
+                        cudaMemcpyDeviceToHost));
+
+  std::vector<unsigned long long> remv(col_blocks);
+  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+  int num_to_keep = 0;
+  for (int i = 0; i < boxes_num; i++)
+  {
+    int nblock = i / BLOCK_THREADS;
+    int inblock = i % BLOCK_THREADS;
+
+    if (!(remv[nblock] & (1ULL << inblock)))
+    {
+      keep_out[num_to_keep++] = i;
+      unsigned long long *p = &mask_host[0] + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++)
+      {
+        remv[j] |= p[j];
+      }
+    }
+  }
+
+  CUDA_CHECK(cudaFree(boxes_dev));
+  CUDA_CHECK(cudaFree(mask_dev));
+  return num_to_keep;
+}
+
+
+//template<>
+template int _nms_gpu<float, threadsPerBlock>(int *keep_out, const float *boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id);
+//template<>
+template int _nms_gpu<double, threadsPerBlock>(int *keep_out, const double *boxes_host, int boxes_num,
+          int boxes_dim, double nms_overlap_thresh, int device_id);
\ No newline at end of file
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
+set(CATCH_HEADER ${PROJECT_SOURCE_DIR}/third_party/catch2)
+add_library(catch_main OBJECT src/catch_main.cpp)
+# target_compile_features(catch_main PUBLIC cxx_std_2a)
+set_property(TARGET catch_main PROPERTY CXX_STANDARD 14)
+target_include_directories(catch_main PRIVATE ${CATCH_HEADER})
+
+file(GLOB files "src/test_*.cpp")
+foreach(file ${files})
+    get_filename_component(file_basename ${file} NAME_WE)
+    string(REGEX REPLACE "test_([^$]+)" "test-\\1" testcase ${file_basename})
+
+    add_executable(${testcase} ${file} $<TARGET_OBJECTS:catch_main>)
+    set_property(TARGET ${testcase} PROPERTY CXX_STANDARD 14)
+    # set_target_properties(${testcase} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    # set_property(TARGET ${testcase} PROPERTY CUDA_STANDARD 14)
+
+    target_compile_definitions(${testcase} PRIVATE
+        CATCH_CONFIG_FAST_COMPILE
+    )
+    target_include_directories(${testcase} PRIVATE
+        ${CATCH_HEADER} ${ALL_INCLUDE}
+    )
+    target_link_libraries(${testcase} ${ALL_LIBS} pybind11::embed -Wl,--no-as-needed spconv)
+    add_test(NAME "${testcase}" 
+        COMMAND ${testcase} 
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
+endforeach()
\ No newline at end of file
--- a/test/src/catch_main.cpp
+++ b/test/src/catch_main.cpp
+// 000-CatchMain.cpp
+
+// In a Catch project with multiple files, dedicate one file to compile the
+// source code of Catch itself and reuse the resulting object file for linking.
+
+// Let Catch provide main():
+#define CATCH_CONFIG_MAIN
+
+#include "catch.hpp"
+
+// That's it
+
+// Compile implementation of Catch for use with files that do contain tests:
+// - g++ -std=c++11 -Wall -I$(CATCH_SINGLE_INCLUDE) -c 000-CatchMain.cpp
+// - cl -EHsc -I%CATCH_SINGLE_INCLUDE% -c 000-CatchMain.cpp
--- a/test/src/test_conv_rule.cpp
+++ b/test/src/test_conv_rule.cpp
+
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include "catch.hpp"
+#include <prettyprint.h>
+#include <string>
+#include <vector>
+#include <exception>
+#include <fmt/format.h>
+#include <numeric>
+#include <pybind11/embed.h> // everything needed for embedding
+#include <pybind11/functional.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <tuple>
+#include <pybind11_utils.h>
+#include <spconv/spconv_ops.h>
+
+namespace py = pybind11;
+
+TEST_CASE("GetConvIndPair", "[SpConvNet]")
+{
+    
+    using namespace py::literals;
+    py::scoped_interpreter guard{}; // start the interpreter and keep it alive
+    py::exec(R"(
+    from __future__ import print_function
+    import numpy as np
+    import math
+    # import spconv
+    # import torch
+
+    def get_convolution_output_size(input_size,
+                                    kernel_size,
+                                    stride,
+                                    padding=None,
+                                    rate=None):
+        ndim = len(input_size)
+        if padding is None:
+            padding = [0] * ndim
+
+        output_size = []
+        for i in range(ndim):
+            output_size.append((input_size[i] + 2 * padding[i] - (
+                (kernel_size[i] - 1) + 1)) // stride[i] + 1)
+        return output_size
+
+
+    def get_test_sparse_data(shape,
+                            num_points,
+                            num_channels,
+                            integer=False,
+                            dtype=np.float32):
+        dense_shape = shape
+        ndim = len(dense_shape)
+        # num_points = np.random.randint(10, 100, size=[batch_size, ndim])
+        num_points = np.array(num_points)
+        # num_points = np.array([3, 2])
+        batch_size = len(num_points)
+        batch_indices = []
+        coors_total = np.stack(
+            np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
+        coors_total = coors_total.reshape(-1, ndim)
+        for i in range(batch_size):
+            np.random.shuffle(coors_total)
+            inds_total = coors_total[:num_points[i]]
+            inds_total = np.pad(
+                inds_total, ((0, 0), (0, 1)), mode="constant", constant_values=i)
+            batch_indices.append(inds_total)
+        if integer:
+            sparse_data = np.random.randint(
+                20, 100, size=[num_points.sum(), num_channels]).astype(dtype)
+        else:
+            sparse_data = np.random.uniform(
+                -1, 1, size=[num_points.sum(), num_channels]).astype(dtype)
+
+        # sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
+        dense_data = np.zeros(
+            [batch_size, num_channels, *dense_shape], dtype=sparse_data.dtype)
+        start = 0
+        for i, inds in enumerate(batch_indices):
+            for j, ind in enumerate(inds):
+                dense_slice = (i, slice(None), *ind[:-1])
+                dense_data[dense_slice] = sparse_data[start + j]
+            start += len(inds)
+        batch_indices = np.concatenate(batch_indices, axis=0)
+        return {
+            "features": sparse_data.astype(dtype),
+            "indices": batch_indices.astype(np.int32),
+            "features_dense": dense_data.astype(dtype),
+        }
+    shape = [50, 30, 30]
+    num_points = [5000] * 1
+    # np.random.seed(np.random.randint(1, 100000))
+    in_channels = 64
+    sparse_dict = get_test_sparse_data(shape, num_points, in_channels)
+    features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
+    indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+    features_dense = sparse_dict["features_dense"]
+    # indices_t = torch.from_numpy(indices)
+    filters = np.random.uniform(0, 1, size=[3, 3, 3, 64, 64]).astype(np.float32)
+    # print(outids.shape)
+    )");
+    SECTION("DebugTest"){
+
+        auto inds = array2TensorView<int>(py::array(py::globals()["indices"]));
+        auto inds_tensor = torch::from_blob(inds.data(), {inds.dim(0), inds.dim(1)}, torch::dtype(torch::kInt32));
+        auto inds_gpu = inds_tensor.to(torch::Device(torch::kCPU));
+        
+        auto features = array2TensorView<float>(py::array(py::globals()["features"]));
+        auto features_tensor = torch::from_blob(features.data(), {features.dim(0), features.dim(1)}, torch::dtype(torch::kFloat));
+        auto features_gpu = features_tensor.to(torch::Device(torch::kCUDA, 0));
+        auto filters = array2TensorView<float>(py::array(py::globals()["filters"]));
+        auto filters_tensor = torch::from_blob(filters.data(), {filters.dim(0), filters.dim(1), filters.dim(2), filters.dim(3), filters.dim(4)}, torch::dtype(torch::kFloat));
+        auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
+        
+        auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
+            {1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, false, false);
+        // std::cout << outputs[2] << std::endl;
+        /*
+        auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
+        std::cout << output << std::endl;*/
+    }
+}
\ No newline at end of file
--- a/test/test_conv.py
+++ b/test/test_conv.py
+# Copyright 2019 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+import spconv 
+import torch
+from torch import nn 
+import numpy as np 
+import time 
+from spconv.test_utils import params_grid, generate_sparse_data, TestCase
+import unittest
+# import sparseconvnet as scn 
+
+class SparseConv3dTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
+                 stride, padding, dilation):
+        super().__init__()
+        layers = [spconv.SparseConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False)]
+        for i in range(1, num_layers):
+            layers.append(spconv.SparseConv3d(
+                out_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False))
+        self.net = spconv.SparseSequential(
+            *layers,
+        )
+        # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
+        self.grid = None
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size):
+        coors = coors.int()
+        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
+        return self.net(x)# .dense()
+
+class SubMConv3dTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
+                 stride, padding, dilation):
+        super().__init__()
+        layers = [spconv.SubMConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False)]
+        for i in range(1, num_layers):
+            layers.append(spconv.SubMConv3d(
+                out_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False))
+        self.net = spconv.SparseSequential(
+            *layers,
+        )
+        self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size):
+        coors = coors.int()
+        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size, self.grid)
+        return self.net(x)# .dense()
+
+
+class Conv3dTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
+                 stride, padding, dilation):
+        super().__init__()
+        layers = [nn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False)]
+        for i in range(1, num_layers):
+            layers.append(nn.Conv3d(
+                out_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False))
+        self.net = nn.Sequential(
+            *layers,
+        )
+        self.shape = shape
+
+    def forward(self, x):
+        return self.net(x)# .dense()
+
+
+class SparseDeConv3dTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
+                 stride, padding, dilation):
+        super().__init__()
+        layers = [spconv.SparseConvTranspose3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False)]
+        for i in range(1, num_layers):
+            layers.append(spconv.SparseConvTranspose3d(
+                out_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False))
+        self.net = spconv.SparseSequential(
+            *layers,
+        )
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size):
+        coors = coors.int()
+        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size)
+        return self.net(x)# .dense()
+
+class DeConv3dTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
+                 stride, padding, dilation):
+        super().__init__()
+        layers = [nn.ConvTranspose3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False)]
+        for i in range(1, num_layers):
+            layers.append(nn.ConvTranspose3d(
+                out_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding=padding,
+                dilation=dilation,
+                bias=False))
+        self.net = nn.Sequential(
+            *layers,
+        )
+        self.shape = shape
+
+    def forward(self, x):
+        return self.net(x)# .dense()
+
+
+class SparseMaxPoolTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, kernel_size,
+                 stride, padding, dilation):
+        super().__init__()
+        layers = [spconv.SparseMaxPool3d(
+                kernel_size,
+                stride, padding, dilation)]
+        for i in range(1, num_layers):
+            layers.append(spconv.SparseMaxPool3d(
+                kernel_size,
+                stride, padding, dilation))
+        self.net = spconv.SparseSequential(
+            *layers,
+        )
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size):
+        coors = coors.int()
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
+        return self.net(x)# .dense()
+
+class MaxPool3dTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, kernel_size,
+                 stride, padding, dilation):
+        super().__init__()
+        layers = [nn.MaxPool3d(
+                kernel_size,
+                stride, padding, dilation)]
+        for i in range(1, num_layers):
+            layers.append(nn.MaxPool3d(
+                kernel_size,
+                stride, padding, dilation))
+        self.net = nn.Sequential(
+            *layers,
+        )
+        self.shape = shape
+
+    def forward(self, x):
+        return self.net(x)# .dense()
+
+
+class SubmanifoldConvTestTorch(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size, stride):
+        super().__init__()
+        layers = [spconv.SubMConv3d(
+                in_channels, out_channels, kernel_size, bias=False, indice_key="subm0")]
+        for i in range(1, num_layers):
+            layers.append(spconv.SubMConv3d(
+                out_channels, out_channels, kernel_size, bias=False))
+        self.net = nn.Sequential(
+            *layers,
+        )
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size):
+        coors = coors.int()
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size )
+        return self.net(x)
+
+class SCNCoupleDeConvTest(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
+                 stride):
+        super().__init__()
+        self.scn_input = scn.InputLayer(ndim, shape, mode=0)
+        self.net = nn.Sequential(
+            scn.Convolution(
+                ndim,
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                bias=False),
+            scn.Deconvolution(
+                ndim,
+                out_channels,
+                in_channels,
+                kernel_size,
+                stride,
+                bias=False),
+            scn.SparseToDense(ndim, in_channels),
+        )
+
+    def forward(self, features, coors, batch_size):
+        coors = coors.long().cpu()
+        x = self.scn_input((coors, features))
+        return self.net(x)
+
+class SparseCoupleDeConvTest(nn.Module):
+    def __init__(self, num_layers, ndim, shape, in_channels, out_channels, kernel_size,
+                 stride):
+        super().__init__()
+        self.net = spconv.SparseSequential(
+            spconv.SparseConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                indice_key="cp0",
+                bias=False),
+            spconv.SparseConvCoupleTranspose3d(
+                out_channels,
+                in_channels,
+                kernel_size,
+                stride,
+                indice_key="cp0",
+                bias=False),
+            
+        )
+        self.todense = spconv.ToDense()
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size):
+        coors = coors.int()
+        x = spconv.SparseConvTensor(features, coors,self.shape, batch_size )
+        return self.todense(self.net(x))# .dense()
+
+
+def gather_nd(params, indices):
+    # this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
+    ndim = indices.shape[-1]
+    output_shape = list(indices.shape[:-1]) + list(params.shape[indices.shape[-1]:])
+    flatted_indices = indices.view(-1, ndim)
+    slices = [flatted_indices[:, i] for i in range(ndim)]
+    slices += [Ellipsis]
+    return params[slices].view(*output_shape)
+
+def scatter_nd(indices, updates, shape):
+    """pytorch edition of tensorflow scatter_nd.
+    this function don't contain except handle code. so use this carefully
+    when indice repeats, don't support repeat add which is supported
+    in tensorflow.
+    """
+    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
+    ndim = indices.shape[-1]
+    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
+    flatted_indices = indices.view(-1, ndim)
+    slices = [flatted_indices[:, i] for i in range(ndim)]
+    slices += [Ellipsis]
+    ret[slices] = updates.view(*output_shape)
+    return ret
+
+
+class TestSpConv(TestCase):
+    
+    def testSpConv3d(self):
+        np.random.seed(484)
+        devices = ["cpu:0"]
+        shapes = [[19, 18, 17]]
+        batchsizes = [1, 2]
+
+        in_channels = [64]
+        out_channels = [32, 48, 64]
+        ksizes = [2, 3]
+        strides = [1, 2, 3]
+        paddings = [0, 1, 2]
+        dilations = [1, 2, 3]
+
+        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
+            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
+            strides, paddings, dilations):
+            if all([s > 1, d > 1]):
+                continue # don't support this.
+            device = torch.device(dev)
+            num_points = [1000] * bs
+
+            sparse_dict = generate_sparse_data(shape, num_points, IC)
+
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
+            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            features_dense = sparse_dict["features_dense"].astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            indices_t = torch.from_numpy(indices).int().to(device)
+            features_t = torch.from_numpy(features).to(device)
+            features_t.requires_grad = True
+            features_dense_t = torch.from_numpy(features_dense).to(device)
+            features_dense_t.requires_grad = True
+            net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
+            net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
+            filters_t = torch.from_numpy(filters).to(device)
+            net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
+            net.net[0].weight.data[:] = filters_t
+            out_ref = net_ref(features_dense_t)
+            out = net(features_t, indices_t, bs).dense().permute(0, 4, 1, 2, 3).contiguous()
+            dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
+            dout_t = torch.from_numpy(dout).to(device)
+            out.backward(dout_t)
+            out_ref.backward(dout_t)
+            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
+            din_sparse = gather_nd(din_dense, indices_t.long())
+            din = features_t.grad.detach()
+            din_np = din.cpu().numpy()
+            din_sparse_np = din_sparse.cpu().numpy()
+            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
+            for layer, layer_ref in zip(net.net, net_ref.net):
+                dw = layer.weight.grad.detach().cpu().numpy()
+                dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
+                dw = dw.transpose(4, 3, 0, 1, 2)
+                self.assertAllClose(dw, dw_ref, atol=1e-4)
+
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
+    
+    def testSpDeConv3d(self):
+        np.random.seed(484)
+        devices = ["cuda:0", "cpu:0"]
+        shapes = [[19, 18, 17]]
+        batchsizes = [1, 2]
+
+        in_channels = [64]
+        out_channels = [32, 48, 64]
+        ksizes = [2, 3]
+        strides = [2, 3]
+        paddings = [0, 1, 2]
+        dilations = [1, 2, 3]
+
+        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
+            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
+            strides, paddings, dilations):
+            if all([s > 1, d > 1]):
+                continue # don't support this.
+            device = torch.device(dev)
+            num_points = [1000] * bs
+
+            sparse_dict = generate_sparse_data(shape, num_points, IC)
+
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
+            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            features_dense = sparse_dict["features_dense"].astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            indices_t = torch.from_numpy(indices).int().to(device)
+            features_t = torch.from_numpy(features).to(device)
+            features_t.requires_grad = True
+            features_dense_t = torch.from_numpy(features_dense).to(device)
+            features_dense_t.requires_grad = True
+            net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
+            net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device)
+            filters_t = torch.from_numpy(filters).to(device)
+            net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1, 2).contiguous()
+            net.net[0].weight.data[:] = filters_t
+            out_ref = net_ref(features_dense_t)
+            out = net(features_t, indices_t, bs).dense().permute(0, 4, 1, 2, 3).contiguous()
+            dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
+            dout_t = torch.from_numpy(dout).to(device)
+            out.backward(dout_t)
+            out_ref.backward(dout_t)
+            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
+            din_sparse = gather_nd(din_dense, indices_t.long())
+            din = features_t.grad.detach()
+            din_np = din.cpu().numpy()
+            din_sparse_np = din_sparse.cpu().numpy()
+            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
+            for layer, layer_ref in zip(net.net, net_ref.net):
+                dw = layer.weight.grad.detach().cpu().numpy()
+                dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
+                dw = dw.transpose(3, 4, 0, 1, 2)
+                self.assertAllClose(dw, dw_ref, atol=1e-4)
+
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
+    
+    def testSpCpConv3d(self):
+        np.random.seed(484)
+        devices = ["cuda:0", "cpu:0"]
+        shapes = [[20, 20, 20]]
+        batchsizes = [1, 2]
+
+        in_channels = [64]
+        out_channels = [32, 48, 64]
+        ksizes = [2]
+        strides = [2]
+        paddings = [0, 1, 2]
+        dilations = [1, 2, 3]
+
+        for dev, shape, bs, IC, OC, k, s in params_grid(
+            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
+            strides):
+            device = torch.device(dev)
+            num_points = [1000] * bs
+
+            sparse_dict = generate_sparse_data(shape, num_points, IC)
+
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
+            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            features_dense = sparse_dict["features_dense"].astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            indices_t = torch.from_numpy(indices).int().to(device)
+            indices_scn_t = torch.from_numpy(indices[:, [1, 2, 3, 0]]).int().to(device)
+            features_t = torch.from_numpy(features).to(device)
+            features_t.requires_grad = True
+            features_ref_t = torch.from_numpy(features).to(device)
+            features_ref_t.requires_grad = True
+
+            net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
+            net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
+            net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(*net_ref.net[0].weight.shape)
+            net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(*net_ref.net[1].weight.shape)
+            out_ref = net_ref(features_ref_t, indices_scn_t, bs)
+            out = net(features_t, indices_t, bs)
+            dout = np.random.uniform(-0.2, 0.2, out_ref.shape).astype(features.dtype)
+            dout_t = torch.from_numpy(dout).to(device)
+            out.backward(dout_t)
+            out_ref.backward(dout_t)
+            din = features_t.grad.detach()
+            din_ref = features_ref_t.grad.detach()
+            din_np = din.cpu().numpy()
+            din_ref_np = din_ref.cpu().numpy()
+            self.assertAllClose(din_ref_np, din_np, atol=1e-4)
+            for layer, layer_ref in zip(net.net, net_ref.net):
+                dw = layer.weight.grad.detach().cpu().numpy()
+                dw_ref = layer_ref.weight.grad.detach().cpu().view(*dw.shape).numpy()
+                self.assertAllClose(dw, dw_ref, atol=1e-4)
+
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
+
+
+    
+    def testSpMaxPool3d(self):
+        np.random.seed(484)
+        devices = ["cuda:0", "cpu:0"]
+        shapes = [[19, 18, 17]]
+        batchsizes = [1, 2]
+
+        in_channels = [64]
+        out_channels = [64]
+        ksizes = [2, 3]
+        strides = [1, 2, 3]
+        paddings = [0, 1]
+        dilations = [1, 2, 3]
+
+        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
+            devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
+            strides, paddings, dilations):
+            if all([s > 1, d > 1]):
+                continue # don't support this.
+            device = torch.device(dev)
+            num_points = [1000] * bs
+            # when data contains negative, sparse maxpool is not equal to dense maxpool.
+            sparse_dict = generate_sparse_data(shape, num_points, IC, data_range=[0.1, 1])
+
+            features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
+            indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            features_dense = sparse_dict["features_dense"].astype(np.float32)
+            filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+            indices_t = torch.from_numpy(indices).int().to(device)
+            features_t = torch.from_numpy(features).to(device)
+            features_t.requires_grad = True
+            features_dense_t = torch.from_numpy(features_dense).to(device)
+            features_dense_t.requires_grad = True
+            net = SparseMaxPoolTestTorch(1, 3, shape, k, s, p, d).to(device)
+            net_ref = MaxPool3dTestTorch(1, 3, shape, k, s, p, d).to(device)
+
+            out_ref = net_ref(features_dense_t)
+            out = net(features_t, indices_t, bs)
+            outids = out.indices
+            outfeatures = out.features
+            out_dense = out.dense()
+            out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
+            
+            dout_sparse = np.random.uniform(-0.2, 0.2, outfeatures.shape).astype(features.dtype)
+            dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
+            dout_t = scatter_nd(outids.long(), dout_sparse_t, list(out_dense.shape))
+            dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
+            out.backward(dout_t)
+            out_ref.backward(dout_t)
+            din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, 1).contiguous()
+            din_sparse = gather_nd(din_dense, indices_t.long())
+            din = features_t.grad.detach()
+            din_np = din.cpu().numpy()
+            din_sparse_np = din_sparse.cpu().numpy()
+            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
+            
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
+    
+
+def main():
+    # function for develop.
+    np.random.seed(484)
+    devices = ["cuda:0"]
+    shapes = [[50, 30, 30]]
+    batchsizes = [3]
+
+    
+    in_channels = [256]
+    out_channels = [256]
+    ksizes = [3]
+    strides = [1]
+    paddings = [0]
+    dilations = [1]
+
+    for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
+        devices, shapes, batchsizes, in_channels, out_channels, ksizes, 
+        strides, paddings, dilations):
+        if all([s > 1, d > 1]):
+            continue
+        device = torch.device(dev)
+        num_points = [5000] * bs
+
+        sparse_dict = generate_sparse_data(shape, num_points, IC)
+
+        features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
+        indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+        features_dense = sparse_dict["features_dense"].astype(np.float32)
+        indices_t = torch.from_numpy(indices)
+        filters = np.random.uniform(0, 1, size=[k, k, k, IC, OC]).astype(np.float32)
+        indices_t = torch.from_numpy(indices).int().to(device).half()
+        features_t = torch.from_numpy(features).to(device).half()
+        
+        features_dense_t = torch.from_numpy(features_dense).to(device).half()
+        net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
+        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d).to(device).half()
+        filters_t = torch.from_numpy(filters).to(device).half()
+        net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, 2).contiguous()
+        net.net[0].weight[:] = filters_t
+        out_ref = net_ref(features_dense_t)
+        times = []
+        for i in range(30):
+            t = time.time()
+            out = net(features_t, indices_t, bs)
+            torch.cuda.synchronize()
+            times.append(time.time() - t)
+        print((net.grid == -1).float().sum(), net.grid.numel())
+            # print("spconv time", time.time() - t)
+        print("spconv time", np.mean(times[2:]))
+        out = net(features_t, indices_t, bs).dense().permute(0, 4, 1, 2, 3).contiguous()
+        print(np.linalg.norm(out.detach().cpu().numpy() - out_ref.detach().cpu().numpy()))
+
+
+if __name__ == '__main__':
+    # main()
+    unittest.main()
+
+
--- a/third_party/catch2/catch.hpp
+++ b/third_party/catch2/catch.hpp