add SparseConvNet algorithm

9ce18407 · yanyan · 47bea04c · 9ce18407 · 9ce18407 · 9ce18407
Commit 9ce18407 authored Jun 16, 2020 by yanyan
18 changed files
--- a/codeai-devops.yaml
+++ b/codeai-devops.yaml
@@ -37,8 +37,8 @@ observers:
        libpaths: [
            /home/yy/anaconda3/lib,
        ]
-        libraries: [-lnvinfer, -lpython3.7m, -lcublas, -lcudart, -ljpeg]
-        std: c++2a
+        libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg]
+        std: c++14
        options: [-Wall, -Wextra]

    cudadev:
@@ -65,11 +65,11 @@ observers:
        std: c++14
        options: [
            -Wno-deprecated-declarations,
-            "-gencode=arch=compute_52,code=sm_52",
+            # "-gencode=arch=compute_52,code=sm_52",
            "-gencode=arch=compute_61,code=sm_61",
-            "-gencode=arch=compute_60,code=sm_60",
-            "-gencode=arch=compute_70,code=sm_70",
-            "-gencode=arch=compute_75,code=sm_75",
+            # "-gencode=arch=compute_60,code=sm_60",
+            # "-gencode=arch=compute_70,code=sm_70",
+            # "-gencode=arch=compute_75,code=sm_75",
        ]

    torchdev:

--- a/include/spconv/fused_conv.cu.h
+++ b/include/spconv/fused_conv.cu.h
+
+/*
+BSD License
+
+For SparseConvNet software
+
+Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define TACC double
+
+template <typename T, int32_t K, int32_t V>
+__global__ void
+dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w,
+                            int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
+                            int32_t input_nPlanes, int32_t input_stride,
+                            int32_t output_nPlanes, int32_t output_stride) {
+  // nHot must be a multiple of K!!
+
+  // Input x Weight -> Output
+  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
+  // K is a multiple of V,
+
+  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
+
+  int32_t M = input_nPlanes / K;
+  // N = gridDim.y == output_nPlanes/K
+  int32_t n = blockIdx.y;
+  int32_t g = blockIdx.z;
+  inFeatures += g * input_nPlanes;
+  outFeatures += n * K + g * output_nPlanes;
+  w += n * K + g * input_nPlanes * output_nPlanes;
+
+  TACC O[V];
+  __shared__ T W[K][K];
+  __shared__ T I[K][K];
+  int32_t R0[V];
+  int32_t R1[V];
+  const int32_t tx = threadIdx.x;
+  int32_t ty[V];
+#pragma unroll
+  for (int32_t v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V);
+
+  for (int32_t m = 0; m < M; m++) {
+// Read w
+#pragma unroll
+    for (int32_t v = 0; v < V; v++)
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+
+    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        R0[v] = rulesIn[s + ty[v]];
+        R1[v] = rulesOut[s + ty[v]];
+      }
+      __syncthreads();
+
+// Read input, reset O[]
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+        O[v] = 0;
+      }
+      __syncthreads();
+
+#pragma unroll
+      for (int32_t k = 0; k < K; k++)
+#pragma unroll
+        for (int32_t v = 0; v < V; v++)
+          O[v] += I[ty[v]][k] * W[k][tx];
+
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        O[v] += outFeatures[R1[v] * output_stride + tx];
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        outFeatures[R1[v] * output_stride + tx] = O[v];
+      __syncthreads();
+    }
+    w += K * output_nPlanes;
+    inFeatures += K;
+  }
+}
+template <typename T, int32_t K, int32_t V>
+__global__ void
+dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w,
+                            int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
+                            int32_t input_nPlanes, int32_t input_stride,
+                            int32_t output_nPlanes, int32_t output_stride) {
+  // Input x Weight -> Output
+  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
+  // K is a multiple of V,
+
+  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
+
+  int32_t M = input_nPlanes / K;
+  // N = gridDim.y == output_nPlanes/K
+  int32_t n = blockIdx.y;
+  int32_t g = blockIdx.z;
+  inFeatures += g * input_nPlanes;
+  outFeatures += n * K + g * output_nPlanes;
+  w += n * K + g * input_nPlanes * output_nPlanes;
+
+  TACC O[V];
+  __shared__ T W[K][K];
+  __shared__ T I[K][K];
+  int32_t R0[V];
+  int32_t R1[V];
+  const int32_t tx = threadIdx.x;
+  int32_t ty[V];
+#pragma unroll
+  for (int32_t v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V);
+
+  for (int32_t m = 0; m < M; m++) {
+// Read w
+#pragma unroll
+    for (int32_t v = 0; v < V; v++)
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+
+    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        if (s + ty[v] < nHot) {
+          R0[v] = rulesIn[s + ty[v]];
+          R1[v] = rulesOut[s + ty[v]];
+        }
+      }
+      __syncthreads();
+
+// Read input, reset O[]
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        if (s + ty[v] < nHot)
+          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+        O[v] = 0;
+      }
+      __syncthreads();
+
+#pragma unroll
+      for (int32_t k = 0; k < K; k++)
+#pragma unroll
+        for (int32_t v = 0; v < V; v++)
+          O[v] += I[ty[v]][k] * W[k][tx];
+
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          O[v] += outFeatures[R1[v] * output_stride + tx];
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          outFeatures[R1[v] * output_stride + tx] = O[v];
+      __syncthreads();
+    }
+    w += K * output_nPlanes;
+    inFeatures += K;
+  }
+}
+
+#define FOO(T, K, V)                                                           \
+  {                                                                            \
+    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
+      int32_t o = (nHot / K) * K;                                              \
+      if (o >= K)                                                              \
+        dConvolution_KMxKN_forwardA<T, K, V>                                   \
+            <<<dim3(std::min(o / K, (int32_t)512), output_nPlanes / K,         \
+                    nGroups),                                                  \
+               dim3(K, K / V), 0, s>>>(                                        \
+                inFeatures, outFeatures, w, rulesIn, rulesOut, o,              \
+                input_nPlanes, input_stride, output_nPlanes, output_stride);   \
+      if (nHot > o)                                                            \
+        dConvolution_KMxKN_forwardB<T, K, V>                                   \
+            <<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>(  \
+                inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \
+                nHot - o, input_nPlanes, input_stride, output_nPlanes,         \
+                output_stride);                                                \
+      return;                                                                  \
+    }                                                                          \
+  }
+template <typename T>
+void dConvolution_forward(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
+                          int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
+                          int32_t input_nPlanes, int32_t input_stride,
+                          int32_t output_nPlanes, int32_t output_stride,
+                          int32_t nGroups) {
+  FOO(T, 64, 16)
+  FOO(T, 32, 8)
+  FOO(T, 16, 4)
+  FOO(T, 8, 2)
+  assert(false);
+}
+template <>
+void dConvolution_forward<double>(cudaStream_t s, double *inFeatures,
+                                  double *outFeatures, double *w,
+                                  int32_t *rulesIn, int32_t *rulesOut,
+                                  int32_t nHot, int32_t input_nPlanes,
+                                  int32_t input_stride, int32_t output_nPlanes,
+                                  int32_t output_stride, int32_t nGroups) {
+  FOO(double, 32, 8)
+  FOO(double, 16, 4)
+  FOO(double, 8, 2)
+  assert(false);
+}
+#undef FOO
+// dOutput x W^T -> dInput and
+// Input^T x dOutput -> dW
+// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
+template <typename T, int32_t K, int32_t V>
+__global__ void dConvolution_KMxKN_backward_dW_A(
+    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
+    int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
+    int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
+  // M = gridDim.y == input_nPlanes / K
+  int32_t N = output_nPlanes / K;
+  int32_t m = blockIdx.y;
+  int32_t g = blockIdx.z;
+  inFeatures += m * K + g * input_nPlanes;
+  dInFeatures += m * K + g * input_nPlanes;
+  dOutFeatures += g * output_nPlanes;
+  w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
+  dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
+  TACC dI[V];
+  TACC dW[V];
+  __shared__ T I[K][K];
+  __shared__ T dO[K][K];
+  __shared__ T W[K][K];
+  int32_t R0[V];
+  int32_t R1[V];
+  const int32_t tx = threadIdx.x;
+  int32_t ty[V];
+#pragma unroll
+  for (int32_t v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V);
+  for (int32_t n = 0; n < N; n++) {
+// Read w, reset dW
+#pragma unroll
+    for (int32_t v = 0; v < V; v++) {
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+      dW[v] = 0;
+    }
+    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        R0[v] = rulesIn[s + ty[v]];
+        R1[v] = rulesOut[s + ty[v]];
+        dI[v] = 0;
+      }
+      __syncthreads();
+// Read input and dOutput
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+      }
+      __syncthreads();
+#pragma unroll
+      for (int32_t k = 0; k < K; k++)
+#pragma unroll
+        for (int32_t v = 0; v < V; v++) {
+          dI[v] += dO[ty[v]][k] * W[tx][k];
+          dW[v] += I[k][ty[v]] * dO[k][tx];
+        }
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        dI[v] += dInFeatures[R0[v] * input_stride + tx];
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        dInFeatures[R0[v] * input_stride + tx] = dI[v];
+      __syncthreads();
+    }
+#pragma unroll
+    for (int32_t v = 0; v < V; v++)
+      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
+    w += K;
+    dw += K;
+    dOutFeatures += K;
+  }
+}
+// dOutput x W^T -> dInput and
+// Input^T x dOutput -> dW
+// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
+template <typename T, int32_t K, int32_t V>
+__global__ void dConvolution_KMxKN_backward_dW_B(
+    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
+    int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
+    int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
+  // M = gridDim.y == input_nPlanes / K
+  int32_t N = output_nPlanes / K;
+  int32_t m = blockIdx.y;
+  int32_t g = blockIdx.z;
+  inFeatures += m * K + g * input_nPlanes;
+  dInFeatures += m * K + g * input_nPlanes;
+  dOutFeatures += g * output_nPlanes;
+  w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
+  dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
+  TACC dI[V];
+  TACC dW[V];
+  __shared__ T I[K][K];
+  __shared__ T dO[K][K];
+  __shared__ T W[K][K];
+  int32_t R0[V];
+  int32_t R1[V];
+  const int32_t tx = threadIdx.x;
+  int32_t ty[V];
+#pragma unroll
+  for (int32_t v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V);
+  for (int32_t n = 0; n < N; n++) {
+// Read w, reset dW
+#pragma unroll
+    for (int32_t v = 0; v < V; v++) {
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+      dW[v] = 0;
+    }
+    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        if (s + ty[v] < nHot) {
+          R0[v] = rulesIn[s + ty[v]];
+          R1[v] = rulesOut[s + ty[v]];
+        }
+        dI[v] = 0;
+      }
+      __syncthreads();
+// Read input and dOutput
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        if (s + ty[v] < nHot) {
+          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+        } else {
+          I[ty[v]][tx] = 0;
+          dO[ty[v]][tx] = 0;
+        }
+      __syncthreads();
+#pragma unroll
+      for (int32_t k = 0; k < K; k++)
+#pragma unroll
+        for (int32_t v = 0; v < V; v++) {
+          dI[v] += dO[ty[v]][k] * W[tx][k];
+          dW[v] += I[k][ty[v]] * dO[k][tx];
+        }
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          dI[v] += dInFeatures[R0[v] * input_stride + tx];
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          dInFeatures[R0[v] * input_stride + tx] = dI[v];
+      __syncthreads();
+    }
+#pragma unroll
+    for (int32_t v = 0; v < V; v++)
+      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
+    w += K;
+    dw += K;
+    dOutFeatures += K;
+  }
+}
+#define FOO(T, K, V)                                                           \
+  {                                                                            \
+    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
+      int32_t o = (nHot / K) * K;                                              \
+      if (o >= K)                                                              \
+        dConvolution_KMxKN_backward_dW_A<T, K, V>                              \
+            <<<dim3(std::min(o / K, (int32_t)512), input_nPlanes / K,          \
+                    nGroups),                                                  \
+               dim3(K, K / V), 0, s>>>(inFeatures, dInFeatures, dOutFeatures,  \
+                                       w, dw, rulesIn, rulesOut, o,            \
+                                       input_nPlanes, input_stride,            \
+                                       output_nPlanes, output_stride);         \
+      if (nHot > o)                                                            \
+        dConvolution_KMxKN_backward_dW_B<T, K, V>                              \
+            <<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>(   \
+                inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \
+                rulesOut + o, nHot - o, input_nPlanes, input_stride,       \
+                output_nPlanes, output_stride);                                \
+      return;                                                                  \
+    }                                                                          \
+  }
+template <typename T>
+void dConvolution_backward_dW(cudaStream_t s, T *inFeatures, T *dInFeatures,
+                              T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
+                              int32_t *rulesOut, int32_t nHot,
+                              int32_t input_nPlanes, int32_t input_stride,
+                              int32_t output_nPlanes, int32_t output_stride,
+                              int32_t nGroups) {
+  FOO(T, 32, 8)
+  FOO(T, 16, 4)
+  FOO(T, 8, 2)
+  assert(false);
+}
+#undef FOO
+template <typename T, int32_t K, int32_t V>
+__global__ void
+dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w,
+                            int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
+                            int32_t input_nPlanes, int32_t input_stride,
+                            int32_t output_nPlanes, int32_t output_stride) {
+  // Input x Weight -> Output
+  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
+  // K is a multiple of V,
+  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
+  // - parallel over N,nHot - loop over M
+  int32_t M = (input_nPlanes + K - 1) / K;
+  // N = gridDim.y ~ output_nPlanes/K
+  int32_t n = blockIdx.y;
+  int32_t g = blockIdx.z;
+  inFeatures += g * input_nPlanes;
+  outFeatures += n * K + g * output_nPlanes;
+  w += n * K + g * input_nPlanes * output_nPlanes;
+  int32_t KO = min(K, output_nPlanes - K * n);
+  TACC O[V];
+  __shared__ T W[K][K];
+  __shared__ T I[K][K];
+  __shared__ int32_t R[K * 2];
+  const int32_t tx = threadIdx.x;
+  int32_t ty[V];
+#pragma unroll
+  for (int32_t v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V);
+  for (int32_t m = 0; m < M; m++) {
+    int32_t KI = min(K, input_nPlanes - K * m);
+// Read w
+#pragma unroll
+    for (int32_t v = 0; v < V; v++)
+      if (ty[v] < KI and tx < KO)
+        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+// Read rules for K input/output pairs
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        if (ty[v] < 1) {
+          if (s + tx < nHot) {
+            R[2 * tx] = rulesIn[s + tx];
+            R[2 * tx + 1] = rulesOut[s + tx];
+          }
+          // R[q] = rules[2 * s + q];
+        }
+      }
+      __syncthreads();
+// Read input, reset O[]
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        if (tx < KI and s + ty[v] < nHot)
+          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
+        O[v] = 0;
+      }
+      __syncthreads();
+#pragma unroll
+      for (int32_t k = 0; k < KI; k++)
+#pragma unroll
+        for (int32_t v = 0; v < V; v++)
+          O[v] += I[ty[v]][k] * W[k][tx];
+      __syncthreads();
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        if (tx < KO and s + ty[v] < nHot)
+          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
+      __syncthreads();
+    }
+    w += K * output_nPlanes;
+    inFeatures += K;
+  }
+}
+// dOutput x W^T -> dInput and
+// Input^T x dOutput -> dW
+// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
+template <typename T, int32_t K, int32_t V>
+__global__ void dConvolution_KMxKN_backward_dW2(
+    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
+    int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
+    int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
+  // M = gridDim.y == input_nPlanes / K
+  int32_t N = (output_nPlanes + K - 1) / K;
+  int32_t m = blockIdx.y;
+  int32_t g = blockIdx.z;
+  inFeatures += m * K + g * input_nPlanes;
+  dInFeatures += m * K + g * input_nPlanes;
+  dOutFeatures += g * output_nPlanes;
+  w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
+  dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
+  int32_t KI = min(K, input_nPlanes - K * m);
+  TACC dI[V];
+  TACC dW[V];
+  __shared__ T I[K][K];
+  __shared__ T dO[K][K];
+  __shared__ T W[K][K];
+  __shared__ int32_t R[K * 2];
+  const int32_t tx = threadIdx.x;
+  int32_t ty[V];
+#pragma unroll
+  for (int32_t v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V);
+  for (int32_t n = 0; n < N; n++) {
+    int32_t KO = min(K, output_nPlanes - K * n);
+// Read w, reset dW
+#pragma unroll
+    for (int32_t v = 0; v < V; v++) {
+      if (ty[v] < KI and tx < KO)
+        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+      dW[v] = 0;
+    }
+    for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
+// Read rules for K input/output pairs, reset dI[]
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        if (ty[v] < 1) {
+          if (s + tx < nHot) {
+            R[2 * tx] = rulesIn[s + tx];
+            R[2 * tx + 1] = rulesOut[s + tx];
+          }
+          // R[q] = rules[2 * s + q];
+        }
+        dI[v] = 0;
+      }
+      __syncthreads();
+// Read input and dOutput
+#pragma unroll
+      for (int32_t v = 0; v < V; v++) {
+        if (tx < KI and s + ty[v] < nHot)
+          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
+        else
+          I[ty[v]][tx] = 0;
+        if (tx < KO and s + ty[v] < nHot)
+          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
+        else
+          dO[ty[v]][tx] = 0;
+      }
+      __syncthreads();
+#pragma unroll
+      for (int32_t k = 0; k < KO; k++)
+#pragma unroll
+        for (int32_t v = 0; v < V; v++)
+          dI[v] += dO[ty[v]][k] * W[tx][k];
+#pragma unroll
+      for (int32_t k = 0; k < K; k++)
+#pragma unroll
+        for (int32_t v = 0; v < V; v++)
+          dW[v] += I[k][ty[v]] * dO[k][tx];
+      __syncthreads();
+#pragma unroll
+      for (int32_t v = 0; v < V; v++)
+        if (tx < KI and s + ty[v] < nHot)
+          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
+      __syncthreads();
+    }
+#pragma unroll
+    for (int32_t v = 0; v < V; v++)
+      if (ty[v] < KI and tx < KO)
+        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
+    w += K;
+    dw += K;
+    dOutFeatures += K;
+  }
+}
+template <typename T>
+void dConvolution_forward2(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
+                           int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
+                           int32_t input_nPlanes, int32_t input_stride,
+                           int32_t output_nPlanes, int32_t output_stride,
+                           int32_t nGroups) {
+  int32_t c = input_nPlanes * output_nPlanes * nGroups;
+  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
+    const int32_t K = 16;
+    const int32_t V = 4;
+    dConvolution_KMxKN_forward2<T, K, V>
+        <<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
+           s>>>(inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
+                input_nPlanes, input_stride, output_nPlanes, output_stride);
+
+  } else {
+    dConvolution_forward(s, inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
+                         input_nPlanes, input_stride, output_nPlanes,
+                         output_stride, nGroups);
+  }
+}
+template <typename T>
+void dConvolution_backward_dW2(cudaStream_t s, T *inFeatures, T *dInFeatures,
+                               T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
+                               int32_t *rulesOut, int32_t nHot,
+                               int32_t input_nPlanes, int32_t input_stride,
+                               int32_t output_nPlanes, int32_t output_stride,
+                               int32_t nGroups) {
+  int32_t c = input_nPlanes * output_nPlanes * nGroups;
+  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
+    const int32_t K = 16;
+    const int32_t V = 4;
+    dConvolution_KMxKN_backward_dW2<T, K, V>
+        <<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
+           s>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn, rulesOut,
+                nHot, input_nPlanes, input_stride, output_nPlanes,
+                output_stride);
+  } else {
+    dConvolution_backward_dW(s, inFeatures, dInFeatures, dOutFeatures, w, dw,
+                             rulesIn, rulesOut, nHot, input_nPlanes,
+                             input_stride, output_nPlanes, output_stride,
+                             nGroups);
+  }
+}
+#undef TACC
\ No newline at end of file
--- a/include/spconv/fused_conv.h
+++ b/include/spconv/fused_conv.h
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <cuda_runtime_api.h>
+#include <tensorview/tensorview.h>
+#include <torch/script.h>
+namespace spconv {
+
+void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
+                     torch::Tensor filters, torch::Tensor indicesIn,
+                     torch::Tensor indicesOut, int nHot);
+
+void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
+                              torch::Tensor dout, torch::Tensor filters,
+                              torch::Tensor dfilters, torch::Tensor indicesIn,
+                              torch::Tensor indicesOut, int nHot);
+
+} // namespace spconv
--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -20,6 +20,18 @@
 #include <tensorview/tensorview.h>

 namespace spconv {
+
+template <bool UseDeconv, typename Index, unsigned NDim> struct ConvIndiceDispatch;
+
+template <typename Index, unsigned NDim>
+struct ConvIndiceDispatch<true, Index, NDim>{
+  constexpr static auto* func = getValidOutPosTranspose<Index, NDim>;
+};
+template <typename Index, unsigned NDim>
+struct ConvIndiceDispatch<false, Index, NDim>{
+  constexpr static auto* func = getValidOutPos<Index, NDim>;
+};
+
 template <typename Index, unsigned NDim, bool UseDeconv,
          int KernelMaxVolume = 256, typename Index1D = int>
 __global__ void prepareIndicePairsKernel(
@@ -47,19 +59,10 @@ __global__ void prepareIndicePairsKernel(
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    if (UseDeconv) {
-      // nvcc will optimize this fake "if constexpr"
-      // after cuda 11 released, we will start to use real if constexpr.
-      numValidPoints = getValidOutPosTranspose<Index, NDim>(
-          indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-          stride.data(), padding.data(), dilation.data(),
-          outSpatialShape.data(), validPoints);
-    } else {
-      numValidPoints = getValidOutPos<Index, NDim>(
-          indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
-          stride.data(), padding.data(), dilation.data(),
-          outSpatialShape.data(), validPoints);
-    }
+    numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
+        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
+        stride.data(), padding.data(), dilation.data(),
+        outSpatialShape.data(), validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];

--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -23,7 +23,9 @@

 namespace spconv {

-enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
+enum ConvAlgo { kNative = 0, kBatch, kBatchGemmGather, kSparseConvNet };
+using all_conv_algos_t =
+    tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather, kSparseConvNet>;

 // torch.jit's doc says only support int64, so we need to convert to int32.
 std::vector<torch::Tensor>

--- a/include/spgemm/gemm.h
+++ b/include/spgemm/gemm.h
+#pragma once
 #include <cutlass/gemm/device/gemm.h>
 #include <type_traits>
 namespace spconv {

--- a/include/spgemm/gemm_th.h
+++ b/include/spgemm/gemm_th.h
-
+#pragma once
 #include <cuda_runtime_api.h>
 #include <tensorview/torch_utils.h>
 #include <torch/script.h>

--- a/include/tensorview/cc17.h
+++ b/include/tensorview/cc17.h
+/*
+From PyTorch:
+
+Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
+Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU                      (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+From Caffe2:
+
+Copyright (c) 2016-present, Facebook Inc. All rights reserved.
+
+All contributions by Facebook:
+Copyright (c) 2016 Facebook Inc.
+ 
+All contributions by Google:
+Copyright (c) 2015 Google Inc.
+All rights reserved.
+ 
+All contributions by Yangqing Jia:
+Copyright (c) 2015 Yangqing Jia
+All rights reserved.
+ 
+All contributions from Caffe:
+Copyright(c) 2013, 2014, 2015, the respective contributors
+All rights reserved.
+ 
+All other contributions:
+Copyright(c) 2015, 2016 the respective contributors
+All rights reserved.
+ 
+Caffe2 uses a copyright model similar to Caffe: each contributor holds
+copyright over their contributions to Caffe2. The project versioning records
+all such contribution and copyright details. If a contributor wants to further
+mark their specific copyright on a particular contribution, they should
+indicate their copyright solely in the commit message of the change when it is
+committed.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+   and IDIAP Research Institute nor the names of its contributors may be
+   used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+#pragma once
+#include <type_traits>
+#include <utility>
+
+namespace tv {
+  
+#ifdef __cpp_lib_void_t
+
+template <class T> using void_t = std::void_t<T>;
+
+#else
+
+// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
+// (it takes CWG1558 into account and also works for older compilers)
+template <typename... Ts> struct make_void { typedef void type; };
+template <typename... Ts> using void_t = typename make_void<Ts...>::type;
+
+#endif
+
+namespace detail {
+struct _identity final {
+  template <class T> using type_identity = T;
+
+  template <class T> decltype(auto) operator()(T &&arg) {
+    return std::forward<T>(arg);
+  }
+};
+template<class Func, class Enable = void>
+struct function_takes_identity_argument : std::false_type {};
+#if defined(_MSC_VER)
+// For some weird reason, MSVC shows a compiler error when using guts::void_t instead of std::void_t.
+// But we're only building on MSVC versions that have std::void_t, so let's just use that one.
+template<class Func>
+struct function_takes_identity_argument<Func, std::void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
+#else
+template<class Func>
+struct function_takes_identity_argument<Func, void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
+#endif
+
+template<bool Condition>
+struct _if_constexpr;
+
+template<>
+struct _if_constexpr<true> final {
+  template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
+  static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
+    // The _identity instance passed in can be used to delay evaluation of an expression,
+    // because the compiler can't know that it's just the identity we're passing in.
+    return thenCallback(_identity());
+  }
+
+  template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
+  static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
+    return thenCallback();
+  }
+};
+
+template<>
+struct _if_constexpr<false> final {
+  template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
+  static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
+    // The _identity instance passed in can be used to delay evaluation of an expression,
+    // because the compiler can't know that it's just the identity we're passing in.
+    return elseCallback(_identity());
+  }
+
+  template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
+  static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
+    return elseCallback();
+  }
+};
+} // namespace detail
+
+/*
+ * Get something like C++17 if constexpr in C++14.
+ *
+ * Example 1: simple constexpr if/then/else
+ *   template<int arg> int increment_absolute_value() {
+ *     int result = arg;
+ *     if_constexpr<(arg > 0)>(
+ *       [&] { ++result; }  // then-case
+ *       [&] { --result; }  // else-case
+ *     );
+ *     return result;
+ *   }
+ *
+ * Example 2: without else case (i.e. conditionally prune code from assembly)
+ *   template<int arg> int decrement_if_positive() {
+ *     int result = arg;
+ *     if_constexpr<(arg > 0)>(
+ *       // This decrement operation is only present in the assembly for
+ *       // template instances with arg > 0.
+ *       [&] { --result; }
+ *     );
+ *     return result;
+ *   }
+ *
+ * Example 3: branch based on type (i.e. replacement for SFINAE)
+ *   struct MyClass1 {int value;};
+ *   struct MyClass2 {int val};
+ *   template <class T>
+ *   int func(T t) {
+ *     return if_constexpr<std::is_same<T, MyClass1>::value>(
+ *       [&](auto _) { return _(t).value; }, // this code is invalid for T == MyClass2, so a regular non-constexpr if statement wouldn't compile
+ *       [&](auto _) { return _(t).val; }    // this code is invalid for T == MyClass1
+ *     );
+ *   }
+ *
+ * Note: The _ argument passed in Example 3 is the identity function, i.e. it does nothing.
+ *       It is used to force the compiler to delay type checking, because the compiler
+ *       doesn't know what kind of _ is passed in. Without it, the compiler would fail
+ *       when you try to access t.value but the member doesn't exist.
+ *
+ * Note: In Example 3, both branches return int, so func() returns int. This is not necessary.
+ *       If func() had a return type of "auto", then both branches could return different
+ *       types, say func<MyClass1>() could return int and func<MyClass2>() could return string.
+ */
+template<bool Condition, class ThenCallback, class ElseCallback>
+decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) {
+#if defined(__cpp_if_constexpr)
+  // If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
+  // This will give us better error messages.
+  if constexpr(Condition) {
+    if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
+      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
+    } else {
+      return std::forward<ThenCallback>(thenCallback)();
+    }
+  } else {
+    if constexpr (detail::function_takes_identity_argument<ElseCallback>::value) {
+      return std::forward<ElseCallback>(elseCallback)(detail::_identity());
+    } else {
+      return std::forward<ElseCallback>(elseCallback)();
+    }
+  }
+#else
+  // C++14 implementation of if constexpr
+  return detail::_if_constexpr<Condition>::call(std::forward<ThenCallback>(thenCallback),
+                                                 std::forward<ElseCallback>(elseCallback));
+#endif
+}
+
+template<bool Condition, class ThenCallback>
+decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
+#if defined(__cpp_if_constexpr)
+  // If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
+  // This will give us better error messages.
+  if constexpr(Condition) {
+    if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
+      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
+    } else {
+      return std::forward<ThenCallback>(thenCallback)();
+    }
+  }
+#else
+  // C++14 implementation of if constexpr
+  return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback), [] (auto) {});
+#endif
+}
+
+
+}
--- a/include/tensorview/common.h
+++ b/include/tensorview/common.h
@@ -11,7 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
+#pragma once
 #include <iostream>
 #include <sstream>
 #ifdef TV_USE_STACKTRACE

--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
@@ -28,6 +28,7 @@ If you can use libtorch, dont use tv::Tensor.
 #include <iomanip>
 #include <memory>
 #include <type_traits>
+#include "cc17.h"
 #ifdef TV_CUDA
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
@@ -623,66 +624,35 @@ struct Tensor {
    return TensorView<T, Rank, PtrTraits, Tindex>(
        reinterpret_cast<T *>(data<T>()), shape, stride);
  }
-  template <typename T, int Rank = -1,
-            template <class> class PtrTraits = DefaultPtrTraits,
-            typename Tindex = int,
-            typename std::enable_if<Rank == -1, int>::type = 0>
-  TensorView<T, Rank, PtrTraits, Tindex> tview() {
-    writable_check();
-    static_assert(Rank == -1 || Rank > 0, "error");
-    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-    ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
-    for (size_t i = 0; i < ndim(); ++i) {
-      shape[i] = shape_[i];
-      stride[i] = stride_[i];
-    }
-    return TensorView<T, Rank, PtrTraits, Tindex>(
-        reinterpret_cast<T *>(data<T>()), shape, stride);
-  }

  template <typename T, int Rank = -1,
            template <class> class PtrTraits = DefaultPtrTraits,
-            typename Tindex = int,
-            typename std::enable_if<(Rank > 0), int>::type = 0>
+            typename Tindex = int>
  TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
  tview() const {
    static_assert(Rank == -1 || Rank > 0, "error");
-    if (Rank > 0) {
-      TV_ASSERT_RT_ERR(Rank == ndim(), "error");
-    }
    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-
-    ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
-    for (int i = 0; i < Rank; ++i) {
-      shape[i] = shape_[i];
-      stride[i] = stride_[i];
-    }
-    return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
-        reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
-        stride);
-  }
-  template <typename T, int Rank = -1,
-            template <class> class PtrTraits = DefaultPtrTraits,
-            typename Tindex = int,
-            typename std::enable_if<Rank == -1, int>::type = 0>
-  TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
-  tview() const {
-    static_assert(Rank == -1 || Rank > 0, "error");
-    if (Rank > 0) {
+    return if_constexpr<(Rank > 0)>([&](auto _){
      TV_ASSERT_RT_ERR(Rank == ndim(), "error");
-    }
-    TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
-
-    ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
-    for (int i = 0; i < int(ndim()); ++i) {
-      shape[i] = shape_[i];
-      stride[i] = stride_[i];
-    }
-    return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
-        reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
-        stride);
+      ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
+      for (int i = 0; i < Rank; ++i) {
+        shape[i] = shape_[i];
+        stride[i] = stride_[i];
+      }
+      return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
+          reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
+          stride);
+    }, [&](auto _){
+      ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim());
+      for (int i = 0; i < int(ndim()); ++i) {
+        shape[i] = shape_[i];
+        stride[i] = stride_[i];
+      }
+      return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
+          reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
+          stride);
+    });
  }
-
  template <class... Inds> Tensor view(Inds... newShapes) const {
    static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
    TensorShape shape{int(newShapes)...};
@@ -780,10 +750,10 @@ struct Tensor {

  int dim(int idx) const {
    if (idx < 0) {
-      TV_ASSERT_RT_ERR(shape_.size() + idx < shape_.size(), idx, shape_);
-      return shape_[shape_.size() + idx];
+      TV_ASSERT_RT_ERR(shape_.ndim() + idx < shape_.ndim(), idx, shape_);
+      return shape_[shape_.ndim() + idx];
    } else {
-      TV_ASSERT_RT_ERR(idx < int(shape_.size()), idx, shape_);
+      TV_ASSERT_RT_ERR(idx < int(shape_.ndim()), idx, shape_);
      return shape_[idx];
    }
  }

--- a/include/tensorview/tools.h
+++ b/include/tensorview/tools.h
@@ -35,6 +35,24 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
    mCurTime = std::chrono::steady_clock::now();
    return res;
  }
+  template <int Count, typename F>
+  double benchmark(F&& f, int start=int(Count) * 0.3){
+    // std::vector<TimeT::rep> times;
+    auto res = typename TimeT::rep();
+    int count = 0;
+    cudaDeviceSynchronize();
+    for (int i = 0; i < Count; ++i){
+      std::forward<F>(f)();
+      auto time = report();
+      if (i >= start){
+        // times.push_back(time)
+        res += time;
+        count += 1;
+      }
+    }
+    return res / double(count);
+
+  }

 private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -23,7 +23,7 @@ class ConvAlgo(Enum):
    Native = 0  # small memory cost, faster when number of points is large.
    Batch = 1  # high memory cost, faster when number of points is small (< 50000)
    BatchGemmGather = 2  # high memory cost, faster when number of points medium
-
+    SparseConvNet = 3

 def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    ndim = len(input_size)

--- a/spconv/spatial.py
+++ b/spconv/spatial.py
+# Copyright 2019 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import time
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import init
+from torch.nn.parameter import Parameter
+
+import spconv
+from spconv.modules import SparseModule
+
+class RemoveDuplicate(SparseModule):
+    def forward(self, x: spconv.SparseConvTensor):
+        inds = x.indices 
+        spatial_shape = [x.batch_size, *x.spatial_shape]
+        spatial_stride = [0] * len(spatial_shape)
+        val = 1
+        for i in range(inds.shape[1] - 1, -1, -1):
+            spatial_stride[i] = val
+            val *= spatial_shape[i]
+        indices_index = inds[:, -1]
+        for i in range(len(spatial_shape) - 1):
+            indices_index += spatial_stride[i] * inds[:, i]
+        _, unique_inds = torch.unique(indices_index)
+        new_inds = inds[unique_inds]
+        new_features = x.features[unique_inds]
+        res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape, x.batch_size, x.grid)
+        return res
\ No newline at end of file
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
 set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
 if (SPCONV_BuildCUDA)
-    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc)
+    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc fused_conv.cu)
 endif()
 add_library(spconv SHARED ${ALL_FILES})


--- a/src/spconv/fused_conv.cu
+++ b/src/spconv/fused_conv.cu
+#include <ATen/ATen.h>
+#include <spconv/fused_conv.cu.h>
+#include <spconv/fused_conv.h>
+#include <tensorview/torch_utils.h>
+namespace spconv {
+void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
+                     torch::Tensor filters, torch::Tensor indicesIn,
+                     torch::Tensor indicesOut, int nHot) {
+  auto dtype = output.scalar_type();
+  auto input_nPlanes = features.size(1);
+  auto output_nPlanes = output.size(1);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  tv::dispatch_torch<float, at::Half>(dtype, [&](auto I) {
+    using T = decltype(I);
+    dConvolution_forward2(stream, features.data_ptr<T>(), output.data_ptr<T>(),
+                          filters.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
+                          indicesOut.data_ptr<int32_t>(), nHot, input_nPlanes,
+                          input_nPlanes, output_nPlanes, output_nPlanes, 1);
+  });
+}
+
+void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
+                              torch::Tensor dout, torch::Tensor filters,
+                              torch::Tensor dfilters, torch::Tensor indicesIn,
+                              torch::Tensor indicesOut, int nHot) {
+  auto dtype = features.scalar_type();
+  auto input_nPlanes = features.size(1);
+  auto output_nPlanes = dout.size(1);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  tv::dispatch_torch<float>(dtype, [&](auto I) {
+    using T = decltype(I);
+    dConvolution_backward_dW2(
+        stream, features.data_ptr<T>(), din.data_ptr<T>(), dout.data_ptr<T>(),
+        filters.data_ptr<T>(), dfilters.data_ptr<T>(),
+        indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>(), nHot,
+        input_nPlanes, input_nPlanes, output_nPlanes, output_nPlanes, 1);
+  });
+}
+
+} // namespace spconv
\ No newline at end of file
--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -28,7 +28,6 @@
 #include <thrust/execution_policy.h>
 #include <type_traits>
 #include <utility/timer.h>
-
 namespace spconv {

 using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
@@ -330,7 +329,6 @@ int create_submconv_indice_pair_cuda(
        auto found = false;
        if (dilation_one && (NDim == 2 || NDim == 3)) {
          auto indiceNumCpu = indiceNum.cpu();
-
          if (NDim == 2) {
            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end());

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
+#include <spconv/fused_conv.h>
 #include <spconv/spconv_ops.h>
 #include <spgemm/gemm_th.h>
+#include <tensorview/tensor.h>

 namespace spconv {

@@ -140,27 +142,11 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
  }
 }

-torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM,
-                         int64_t algo) {
+torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
+                               torch::Tensor indicePairs,
+                               torch::Tensor indiceNum, int64_t numActOut,
+                               int64_t _inverse, int64_t _subM) {
  auto kernelVolume = indiceNum.size(0);
-  switch (algo) {
-  case kBatchGemmGather:
-  case kBatch: {
-    if (kernelVolume != 1) {
-      return indiceConvBatch(features, filters, indicePairs, indiceNum,
-                             numActOut, _inverse, _subM,
-                             algo != kBatchGemmGather);
-    } else {
-      break;
-    }
-  }
-  case kNative:
-    break;
-  default:
-    TV_THROW_RT_ERR("unknown algo");
-  }
  // auto timer = spconv::CudaContextTimer<>();

  bool subM = _subM != 0;
@@ -184,16 +170,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
    torch::mm_out(output, features, filters[indicePairMaxOffset]);

    // get indice pair second max size based on subM symmetric property
-    indicePairMaxSize =
-      *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                        indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
+    indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
+                                          indicePairNumCpu.data_ptr<int>() +
+                                              indicePairMaxOffset);
    if (indicePairMaxSize == 0) {
      return output;
    }
  } else {
    indicePairMaxSize =
-      *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                        indicePairNumCpu.data_ptr<int>() + kernelVolume);
+        *std::max_element(indicePairNumCpu.data_ptr<int>(),
+                          indicePairNumCpu.data_ptr<int>() + kernelVolume);
  }

  torch::Tensor inputBuffer =
@@ -260,12 +246,59 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
  return output;
 }

+torch::Tensor
+indiceConvSparseConvNet(torch::Tensor features, torch::Tensor filters,
+                        torch::Tensor indicePairs, torch::Tensor indiceNum,
+                        int64_t numActOut, int64_t _inverse, int64_t _subM) {
+  auto kernelVolume = indiceNum.size(0);
+  // auto timer = spconv::CudaContextTimer<>();
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+
+  // init for subM
+  int indicePairMaxOffset = kernelVolume / 2;
+  if (subM) { // the center index of subm conv don't need gather and scatter
+    // add.
+    torch::mm_out(output, features, filters[indicePairMaxOffset]);
+  }
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
+    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
+      continue;
+    }
+    if (device == torch::kCPU) {
+      TV_THROW_INVALID_ARG("SparseConvNet only support gpu");
+    }
+#ifdef TV_CUDA
+    else if (device == torch::kCUDA) {
+      fused_conv_cuda(output, features, filters[i], indicePairs[inverse][i],
+                      indicePairs[!inverse][i], nHot);
+    }
+#endif
+    else {
+      TV_THROW_INVALID_ARG("unknown device type");
+    }
+  }
+  return output;
+}
+
+template <bool BatchScatter>
 torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
                              torch::Tensor indicePairs,
                              torch::Tensor indiceNum, int64_t numActOut,
-                              int64_t _inverse, int64_t _subM,
-                              bool batchScatter) {
+                              int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
+  auto batchScatter = BatchScatter;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
@@ -388,29 +421,42 @@ torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
  return output;
 }

+template <int Algo> struct ConvDispatch;
+
+template <> struct ConvDispatch<kNative> {
+  constexpr static auto *func = indiceConvNative;
+};
+
+template <> struct ConvDispatch<kBatch> {
+  constexpr static auto *func = indiceConvBatch<false>;
+};
+
+template <> struct ConvDispatch<kBatchGemmGather> {
+  constexpr static auto *func = indiceConvBatch<true>;
+};
+
+template <> struct ConvDispatch<kSparseConvNet> {
+  constexpr static auto *func = indiceConvSparseConvNet;
+};
+
+torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
+                         torch::Tensor indicePairs, torch::Tensor indiceNum,
+                         int64_t numActOut, int64_t _inverse, int64_t _subM,
+                         int64_t algo) {
+  torch::Tensor res;
+  tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
+    constexpr int AlgoValue = decltype(I)::value;
+    res = ConvDispatch<AlgoValue>::func(features, filters, indicePairs,
+                                        indiceNum, numActOut, _inverse, _subM);
+  });
+  return res;
+}
+
 std::vector<torch::Tensor>
-indiceConvBackward(torch::Tensor features, torch::Tensor filters,
+indiceConvBwNative(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
-                   int64_t algo) {
+                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
  auto kernelVolume = indiceNum.size(0);
-  switch (algo) {
-  case kBatchGemmGather:
-  case kBatch: {
-    if (kernelVolume != 1) {
-      return indiceConvBackwardBatch(features, filters, outGrad, indicePairs,
-                                     indiceNum, _inverse, _subM,
-                                     algo != kBatchGemmGather);
-    } else {
-      break;
-    }
-  }
-  case kNative:
-    break;
-  default:
-    TV_THROW_RT_ERR("unknown algo");
-  }
-
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;

@@ -437,16 +483,16 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());

    // get indice pair second max size based on subM symmetric property
-    indicePairMaxSize =
-      *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                        indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
+    indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
+                                          indicePairNumCpu.data_ptr<int>() +
+                                              indicePairMaxOffset);
    if (indicePairMaxSize == 0) {
      return {inputGrad, filtersGrad.view(filterShape)};
    }
  } else {
    indicePairMaxSize =
-      *std::max_element(indicePairNumCpu.data_ptr<int>(),
-                        indicePairNumCpu.data_ptr<int>() + kernelVolume);
+        *std::max_element(indicePairNumCpu.data_ptr<int>(),
+                          indicePairNumCpu.data_ptr<int>() + kernelVolume);
  }

  torch::Tensor inputBuffer =
@@ -499,13 +545,66 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
 }

 std::vector<torch::Tensor>
-indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
-                        torch::Tensor outGrad, torch::Tensor indicePairs,
-                        torch::Tensor indiceNum, int64_t _inverse,
-                        int64_t _subM, bool batchScatter) {
+indiceConvBwSparseConvNet(torch::Tensor features, torch::Tensor filters,
+                          torch::Tensor outGrad, torch::Tensor indicePairs,
+                          torch::Tensor indiceNum, int64_t _inverse,
+                          int64_t _subM) {
+  auto kernelVolume = indiceNum.size(0);
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;

+  auto device = features.device().type();
+  auto ndim = filters.dim() - 2;
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1);
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device());
+  auto filterShape = filters.sizes();
+  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
+  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
+
+  filters = filters.view({-1, numInPlanes, numOutPlanes});
+  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
+
+  // init for subM
+  int indicePairMaxOffset = kernelVolume / 2;
+  int indicePairMaxSize = indicePairNumCpu.data_ptr<int>()[indicePairMaxOffset];
+  if (subM) {
+    auto filterGradSub = filtersGrad[indicePairMaxOffset];
+    torch::mm_out(filterGradSub, features.t(), outGrad);
+    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
+  }
+  for (int i = 0; i < kernelVolume; ++i) {
+    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
+    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
+      continue;
+    }
+    if (device == torch::kCPU) {
+      TV_THROW_INVALID_ARG("unknown device type");
+    }
+#ifdef TV_CUDA
+    else if (device == torch::kCUDA) {
+      fused_conv_backward_cuda(features, inputGrad, outGrad, filters[i],
+                               filtersGrad[i], indicePairs[inverse][i],
+                               indicePairs[!inverse][i], nHot);
+    }
+#endif
+    else {
+      TV_THROW_INVALID_ARG("unknown device type");
+    }
+  }
+  return {inputGrad, filtersGrad.view(filterShape)};
+}
+
+template <bool BatchScatter>
+std::vector<torch::Tensor>
+indiceConvBwBatch(torch::Tensor features, torch::Tensor filters,
+                  torch::Tensor outGrad, torch::Tensor indicePairs,
+                  torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto batchScatter = BatchScatter;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indiceNum.size(0);
@@ -626,4 +725,35 @@ indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
  return {inputGrad, filtersGrad.view(filterShape)};
 }

+template <int Algo> struct ConvBwDispatch;
+
+template <> struct ConvBwDispatch<kNative> {
+  constexpr static auto *func = indiceConvBwNative;
+};
+
+template <> struct ConvBwDispatch<kBatch> {
+  constexpr static auto *func = indiceConvBwBatch<false>;
+};
+
+template <> struct ConvBwDispatch<kBatchGemmGather> {
+  constexpr static auto *func = indiceConvBwBatch<true>;
+};
+
+template <> struct ConvBwDispatch<kSparseConvNet> {
+  constexpr static auto *func = indiceConvBwSparseConvNet;
+};
+
+std::vector<torch::Tensor>
+indiceConvBackward(torch::Tensor features, torch::Tensor filters,
+                   torch::Tensor outGrad, torch::Tensor indicePairs,
+                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
+                   int64_t algo) {
+  std::vector<torch::Tensor> res;
+  tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
+    constexpr int AlgoValue = decltype(I)::value;
+    res = ConvBwDispatch<AlgoValue>::func(
+        features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
+  });
+  return res;
+}
 } // namespace spconv
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module):
                 stride,
                 padding,
                 dilation,
-                 algo=spconv.ConvAlgo.Native):
+                 algo=spconv.ConvAlgo.SparseConvNet):
        super().__init__()
        layers = [
            spconv.SparseConv3d(in_channels,
@@ -353,7 +353,7 @@ class TestSpConv(TestCase):
        shapes = [[19, 18, 17]]
        batchsizes = [1, 2]

-        in_channels = [64]
+        in_channels = [32]
        out_channels = [32, 48, 64]
        ksizes = [2, 3]
        strides = [1, 2, 3]
@@ -623,8 +623,8 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
    shapes = [[400, 400, 15]]
    batchsizes = [2]

-    in_channels = [32]
-    out_channels = [64]
+    in_channels = [19]
+    out_channels = [17]
    ksizes = [(3, 3, 3)]
    strides = [1]
    paddings = [0]
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):


 if __name__ == '__main__':
-    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
-    # main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
+    # main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
+    # main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
    # TestCase().assertAllClose(out_my, out_ref)
    # unittest.main()
-    # TestSpConv().testSpConv3d()
+    TestSpConv().testSpConv3d()