Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
de3743f6
Commit
de3743f6
authored
Jul 13, 2018
by
Benjamin Thomas Graham
Browse files
Factor out CUDA code
parent
f0407b36
Changes
96
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2180 additions
and
2218 deletions
+2180
-2218
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
+0
-53
sparseconvnet/SCN/CUDA/Convolution.cpp
sparseconvnet/SCN/CUDA/Convolution.cpp
+289
-0
sparseconvnet/SCN/CUDA/Convolution.cu
sparseconvnet/SCN/CUDA/Convolution.cu
+603
-269
sparseconvnet/SCN/CUDA/Convolution.h
sparseconvnet/SCN/CUDA/Convolution.h
+0
-632
sparseconvnet/SCN/CUDA/Deconvolution.cpp
sparseconvnet/SCN/CUDA/Deconvolution.cpp
+84
-0
sparseconvnet/SCN/CUDA/Deconvolution.cu
sparseconvnet/SCN/CUDA/Deconvolution.cu
+589
-69
sparseconvnet/SCN/CUDA/Deconvolution.h
sparseconvnet/SCN/CUDA/Deconvolution.h
+0
-599
sparseconvnet/SCN/CUDA/IOLayers.cpp
sparseconvnet/SCN/CUDA/IOLayers.cpp
+226
-0
sparseconvnet/SCN/CUDA/IOLayers.cu
sparseconvnet/SCN/CUDA/IOLayers.cu
+54
-228
sparseconvnet/SCN/CUDA/IOLayers.h
sparseconvnet/SCN/CUDA/IOLayers.h
+0
-43
sparseconvnet/SCN/CUDA/LeakyReLU.cpp
sparseconvnet/SCN/CUDA/LeakyReLU.cpp
+32
-0
sparseconvnet/SCN/CUDA/LeakyReLU.cu
sparseconvnet/SCN/CUDA/LeakyReLU.cu
+21
-19
sparseconvnet/SCN/CUDA/LeakyReLU.h
sparseconvnet/SCN/CUDA/LeakyReLU.h
+0
-25
sparseconvnet/SCN/CUDA/MaxPooling.cpp
sparseconvnet/SCN/CUDA/MaxPooling.cpp
+104
-0
sparseconvnet/SCN/CUDA/MaxPooling.cu
sparseconvnet/SCN/CUDA/MaxPooling.cu
+61
-87
sparseconvnet/SCN/CUDA/MaxPooling.h
sparseconvnet/SCN/CUDA/MaxPooling.h
+0
-77
sparseconvnet/SCN/CUDA/NetworkInNetwork.cpp
sparseconvnet/SCN/CUDA/NetworkInNetwork.cpp
+0
-2
sparseconvnet/SCN/CUDA/SparseToDense.cpp
sparseconvnet/SCN/CUDA/SparseToDense.cpp
+60
-0
sparseconvnet/SCN/CUDA/SparseToDense.cu
sparseconvnet/SCN/CUDA/SparseToDense.cu
+57
-44
sparseconvnet/SCN/CUDA/SparseToDense.h
sparseconvnet/SCN/CUDA/SparseToDense.h
+0
-71
No files found.
sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
// Forward pass of batchwise multiplicative dropout, fused with a
// leaky-ReLU-style gate:
//   output[o] = input[i] * noise[plane] * (input[i] > 0 ? 1 : alpha)
// `noise` holds one multiplier per feature plane (indexed by `plane`), so the
// same factor is applied to every active site of that plane — hence
// "batchwise". Threads are laid out as (NTX planes) x (NTY rows); both the
// plane loop and the row loop are grid-stride loops.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_fp(T *input_features, T *output_features,
                                  T *noise, Int nActive, Int nPlanes,
                                  Int input_stride, Int output_stride,
                                  T alpha) {
  // Stage this block's NTX-wide slice of `noise` in shared memory; loaded
  // once per plane-band by the threadIdx.y == 0 threads.
  __shared__ T nz[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads(); // nz[] must be populated before any thread reads it
    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      Int i = row * input_stride + plane;  // offset into input_features
      Int o = row * output_stride + plane; // offset into output_features
      output_features[o] = input_features[i] * nz[threadIdx.x] *
                           ((input_features[i] > 0) ? 1 : alpha);
    }
    // All threads must finish with nz[] before the next iteration
    // overwrites it with the next plane-band's noise values.
    __syncthreads();
  }
}
// Backward pass of batchwise multiplicative dropout. Mirrors the forward
// kernel: the gradient is scaled by the same per-plane noise factor and the
// same (input > 0 ? 1 : alpha) gate evaluated on the saved forward input:
//   d_input[i] = d_output[o] * noise[plane] * (input[i] > 0 ? 1 : alpha)
// Thread layout and loop structure are identical to
// BatchwiseMultiplicativeDropout_fp above.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
                                  T *d_output_features, T *noise, Int nActive,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, T alpha) {
  // Per-plane noise factors staged in shared memory for this plane-band.
  __shared__ T nz[NTX];
  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads(); // nz[] ready before use
    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      Int i = row * input_stride + plane;  // offset into (d_)input_features
      Int o = row * output_stride + plane; // offset into d_output_features
      d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
    }
    // Protect nz[] from being overwritten while still in use.
    __syncthreads();
  }
}
#endif
/* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
sparseconvnet/SCN/CUDA/Convolution.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Forward declarations of the CUDA launchers implemented in Convolution.cu
// (this .cpp/.cu split is the point of the "Factor out CUDA code" change).
// `Int` and `RuleBook` are project types declared elsewhere.

// Broadcast the bias vector `b` (nPlanes entries) into each of the
// nActiveOut rows of the output-feature matrix `of`.
template <typename T>
void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut);

// Bias gradient: reduce `matrix` column-wise into `target`.
// NOTE(review): the exact meaning of nRows / nColumns / nCOLUMNS (nCOLUMNS is
// presumably the row stride) is defined by the kernel in Convolution.cu —
// verify there.
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
                         Int nCOLUMNS);

// Rulebook-driven sparse convolution forward pass. Returns a double —
// presumably the flop count performed (the callers below return it as the
// function result); confirm in Convolution.cu.
template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                             RuleBook _rules, Int input_nPlanes,
                             Int input_stride, Int output_nPlanes,
                             Int output_stride);

// Rulebook-driven backward pass: accumulates gradients w.r.t. the input into
// dInFeatures and w.r.t. the weights into dw.
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                               T *w, T *dw, RuleBook _rules, Int input_nPlanes,
                               Int input_stride, Int output_nPlanes,
                               Int output_stride);
// Host-side driver for the strided sparse convolution forward pass.
// Builds (or retrieves) the rulebook for this input/output geometry, sizes
// the output tensor, seeds it with the bias (or zeros), and hands the real
// work to the CUDA launcher dConvolution_forward2.
// Returns the double reported by dConvolution_forward2 (0 when there are no
// active output sites).
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0; // nothing to compute for an empty output
  // weight is (filterVolume?, inPlanes, outPlanes); dims 1 and 2 give the
  // per-site matrix shape.
  Int inPlanes = weight.size(1);
  Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto wPtr = weight.data<T>();
  // Initialize the output: broadcast bias rows when a bias is present,
  // otherwise clear to zero so the convolution can accumulate into it.
  if (bias.numel())
    Convolution_fp_bias(outPtr, bias.data<T>(), outPlanes, nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(inPtr, outPtr, wPtr, rules, inPlanes,
                                  inPlanes, outPlanes, outPlanes);
}
// Host-side driver for the strided sparse convolution backward pass.
// Rebuilds the same rulebook as the forward pass, resizes and zeroes the
// input-gradient tensor, then launches dConvolution_backward_dW2 (input and
// weight gradients) and, if a bias gradient tensor is provided, the bias
// reduction Convolution_bp_bias.
template <typename T, Int Dimension>
void cuda_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Gradients accumulate, so the buffer must start zeroed.
    d_input_features.resize_({nActiveIn, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): per the declaration the parameters are
      // (matrix, target, nRows, nColumns, nCOLUMNS); passing (op, op,
      // nActiveOut) puts nActiveOut in the nCOLUMNS slot even though doF is
      // nActiveOut rows x op columns — verify against the kernel in
      // Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
// Host-side driver for the submanifold convolution forward pass.
// Identical in structure to cuda_Convolution_updateOutput, except that the
// rulebook is the submanifold one (output sites coincide with input sites,
// so nActive is read from inputSize and there is no filterStride).
// Returns the double from dConvolution_forward2 (0 when no active sites).
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  if (nActive) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    output_features.resize_({nActive, op});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    auto w = weight.data<T>();
    // Seed the output with the bias (or zeros) before accumulating.
    if (bias.numel())
      Convolution_fp_bias(oF, bias.data<T>(), op, nActive);
    else
      output_features.zero_();
    return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
  } else {
    return 0;
  }
}
// Host-side driver for the submanifold convolution backward pass.
// Same structure as cuda_Convolution_backward, using the submanifold
// rulebook; input and output share the same active-site count (nActive).
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  if (nActive) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Input gradients accumulate; start from zero.
    d_input_features.resize_({nActive, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): see cuda_Convolution_backward — argument order for
      // Convolution_bp_bias (op, op, nActive) looks suspicious; verify
      // against the kernel in Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
// Host-side driver for the "full convolution" forward pass, which maps
// between two metadata objects: input sites live in mIn, output sites are
// generated into mOut by getFullConvolutionRuleBook. Otherwise identical in
// structure to cuda_Convolution_updateOutput.
// Returns the double from dConvolution_forward2 (0 when no active outputs).
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActiveOut = mOut.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    output_features.resize_({nActiveOut, op});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    auto w = weight.data<T>();
    // Seed the output with the bias (or zeros) before accumulating.
    if (bias.numel())
      Convolution_fp_bias(oF, bias.data<T>(), op, nActiveOut);
    else
      output_features.zero_();
    return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
  } else {
    return 0;
  }
}
// Host-side driver for the full convolution backward pass. Active-site
// counts come from the two metadata objects: nActiveIn from mIn (input side)
// and nActiveOut from mOut (output side). Otherwise identical in structure
// to cuda_Convolution_backward.
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActiveIn = mIn.getNActive(inputSize);
  Int nActiveOut = mOut.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Input gradients accumulate; start from zero.
    d_input_features.resize_({nActiveIn, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): see cuda_Convolution_backward — argument order for
      // Convolution_bp_bias (op, op, nActiveOut) looks suspicious; verify
      // against the kernel in Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
// Host-side driver for the randomized-stride convolution forward pass.
// Same flow as cuda_Convolution_updateOutput, but the rulebook comes from
// getRandomizedStrideRuleBook.
// Returns the double reported by dConvolution_forward2 (0 when there are no
// active output sites).
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                             filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0; // empty output: nothing to do
  Int inPlanes = weight.size(1);
  Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto wPtr = weight.data<T>();
  // Seed the output rows with the bias when present; otherwise clear so the
  // convolution can accumulate into them.
  if (bias.numel())
    Convolution_fp_bias(outPtr, bias.data<T>(), outPlanes, nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(inPtr, outPtr, wPtr, rules, inPlanes,
                                  inPlanes, outPlanes, outPlanes);
}
// Host-side driver for the randomized-stride convolution backward pass.
// Same structure as cuda_Convolution_backward, using the randomized-stride
// rulebook.
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize, /*long*/ at::Tensor filterStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize,
                                              filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);
  if (nActiveOut) {
    Int ip = weight.size(1); // input planes
    Int op = weight.size(2); // output planes
    // Input gradients accumulate; start from zero.
    d_input_features.resize_({nActiveIn, ip});
    d_input_features.zero_();
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // NOTE(review): see cuda_Convolution_backward — argument order for
      // Convolution_bp_bias (op, op, nActiveOut) looks suspicious; verify
      // against the kernel in Convolution.cu.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
sparseconvnet/SCN/CUDA/Convolution.cu
View file @
de3743f6
...
...
@@ -4,315 +4,649 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "RuleBookIterator.h"
template
<
typename
T
,
Int
Dimension
>
double
cuda_Convolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
template
<
typename
T
>
__global__
void
Convolution_fp_bias_
(
T
*
output_features
,
T
*
bias
,
Int
nPlanes
,
Int
nActive
)
{
Int
n
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
T
b
=
bias
[
n
];
output_features
+=
n
;
for
(
Int
row
=
blockIdx
.
y
;
row
<
nActive
;
row
+=
gridDim
.
y
)
{
output_features
[
row
*
nPlanes
]
=
b
;
}
}
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
template
<
typename
T
>
void
Convolution_fp_bias
(
T
*
oF
,
T
*
b
,
Int
nPlanes
,
Int
nActive
)
{
if
(
nPlanes
/
32
>
0
)
Convolution_fp_bias_
<<<
dim3
(
nPlanes
/
32
,
4096
),
32
>>>
(
oF
,
b
,
nPlanes
,
nActive
);
if
(
nPlanes
%
32
>
0
)
{
Int
o
=
nPlanes
/
32
*
32
;
Convolution_fp_bias_
<<<
dim3
(
1
,
4096
),
nPlanes
-
o
>>>
(
oF
+
o
,
b
+
o
,
nPlanes
,
nActive
);
}
}
template
<
typename
T
>
__global__
void
dColumnSum
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
Int
nCOLUMNS
)
{
Int
i
=
blockIdx
.
x
*
32
+
threadIdx
.
x
;
T
t
=
0
;
for
(
Int
j
=
blockIdx
.
y
;
j
<
nRows
;
j
+=
32
)
t
+=
matrix
[
j
*
nCOLUMNS
+
i
];
atomicAdd
(
&
target
[
i
],
t
);
}
template
<
typename
T
>
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
Int
nCOLUMNS
)
{
if
(
nColumns
/
32
>
0
)
dColumnSum
<<<
dim3
(
nColumns
/
32
,
32
),
32
>>>
(
matrix
,
target
,
nRows
,
nColumns
,
nCOLUMNS
);
if
(
nColumns
%
32
>
0
)
{
Int
o
=
nColumns
/
32
*
32
;
dColumnSum
<<<
dim3
(
1
,
32
),
nColumns
-
o
>>>
(
matrix
+
o
,
target
+
o
,
nRows
,
nColumns
,
nCOLUMNS
);
}
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_Convolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
template
<
typename
T
,
Int
Dimension
>
double
cuda_SubmanifoldConvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
filterSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getSubmanifoldRuleBook
(
inputSize
,
filterSize
,
true
);
Int
nActive
=
m
.
getNActive
(
inputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
bias
.
numel
()
and
nActive
)
output_features
.
copy_
(
bias
);
else
output_features
.
zero_
();
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
// if (bias.numel()) {
// auto b = bias.data<T>();
// for (Int i = 0; i < op; i += 32) {
// Int blockDim = min((Int)32, op - i);
// Int gridDim = min((Int)4096, nActive);
// Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
// nActive);
// }
// }
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_SubmanifoldConvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
filterSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getSubmanifoldRuleBook
(
inputSize
,
filterSize
,
true
);
Int
nActive
=
m
.
getNActive
(
inputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dConvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
template
<
>
void
dConvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
assert
(
false
);
}
#undef FOO
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
Dimension
>
double
cuda_FullConvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
mIn
,
Metadata
<
Dimension
>
&
mOut
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
mIn
.
getFullConvolutionRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
mOut
);
Int
nActive
=
mOut
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
double
flops
=
0
;
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
nActive
);
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
else
{
I
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_FullConvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
mIn
,
Metadata
<
Dimension
>
&
mOut
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
mIn
.
getFullConvolutionRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
mOut
);
Int
nActive
=
mOut
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
,
w
+=
c
;
dw
+=
c
;)
// Helper for dConvolution_backward_dW below. If both plane counts are exact
// multiples of K, launch the K-tiled gradient kernels and return from the
// enclosing function. Kernel _A handles the first o = (nHot/K)*K rule pairs
// (an exact multiple of K, so it needs no bounds checks); kernel _B mops up
// the ragged tail (nHot - o pairs) with per-row bounds checks.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_backward_dW_A<                                      \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_backward_dW_B<                                      \
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
            output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }
// Backward pass dispatcher: picks the largest tile size K that divides both
// input_nPlanes and output_nPlanes and launches the corresponding tiled
// gradient kernels via the FOO macro above (which returns on success).
// Reaching assert(false) means neither plane count was a multiple of 8;
// such ragged sizes are handled by the *2 kernel path elsewhere in the file.
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                              T *w, T *dw, Int *rules, Int nHot,
                              Int input_nPlanes, Int input_stride,
                              Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
#undef FOO
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
#pragma unroll
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
]
+=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
Dimension
>
double
cuda_RandomizedStrideConvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
double
flops
=
0
;
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
if
(
bias
.
numel
())
{
auto
b
=
bias
.
data
<
T
>
();
for
(
Int
i
=
0
;
i
<
op
;
i
+=
32
)
{
Int
blockDim
=
min
((
Int
)
32
,
op
-
i
);
Int
gridDim
=
min
((
Int
)
4096
,
nActive
);
Convolution_fp_bias
<<<
gridDim
,
blockDim
>>>
(
oF
+
i
,
b
+
i
,
op
,
op
,
nActive
);
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dConvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
else
I
[
ty
[
v
]][
tx
]
=
0
;
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
];
else
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
KO
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
]
+=
dI
[
v
];
__syncthreads
();
}
Int
c
=
ip
*
op
;
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
template
<
typename
T
>
double
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
double
flops
=
0
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
dConvolution_forward2
<
T
>
(
iF
,
oF
,
w
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
(
dConvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
else
{
RULEBOOKITERATOR
(
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
,
w
+=
c
;
flops
+=
nHotB
*
c
;)
}
return
flops
;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_RandomizedStrideConvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
if
(
nActive
)
{
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dConvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
template
<
typename
T
>
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
(
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
,
w
+=
c
;
dw
+=
c
;)
}
else
{
RULEBOOKITERATOR
(
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
,
w
+=
c
;
dw
+=
c
;)
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
}
}
}
sparseconvnet/SCN/CUDA/Convolution.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_CONVOLUTION_H
#define CUDA_CONVOLUTION_H
// Initialize every active output row with the bias vector, overwriting any
// previous contents. Launched with blockDim.x == number of planes handled
// (at most 32) and a grid of up to 4096 blocks; the hard-coded row stride
// 1 << 12 (= 4096) matches the caller's min((Int)4096, nActive) grid size —
// NOTE(review): these two constants must stay in sync.
// nPlanes is currently unused inside the kernel.
template <typename T>
__global__ void Convolution_fp_bias(T *output_features, T *bias, Int nPlanes,
                                    Int output_stride, Int nActive) {
  __shared__ T b[32];
  // Each thread stages and reads back only its own bias entry, so no
  // __syncthreads() is needed between the write and the reads below.
  b[threadIdx.x] = bias[threadIdx.x];
  for (Int row = blockIdx.x; row < nActive; row += 1 << 12) {
    output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
  }
}
// Sum a (strided) matrix along its rows and atomically accumulate the column
// totals into target. Each thread owns one column (blockIdx.x * 32 +
// threadIdx.x); the 32 blocks in the y-dimension split the rows between them,
// each starting at row blockIdx.y and striding by 32. atomicAdd merges the
// 32 partial sums per column. nCOLUMNS is the physical row stride of matrix,
// which may exceed the logical column count nColumns.
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
                           Int nCOLUMNS) {
  Int i = blockIdx.x * 32 + threadIdx.x;
  T t = 0;
  for (Int j = blockIdx.y; j < nRows; j += 32)
    t += matrix[j * nCOLUMNS + i];
  atomicAdd(&target[i], t);
}
template
<
typename
T
>
void
Convolution_bp_bias
(
T
*
matrix
,
T
*
target
,
Int
nRows
,
Int
nColumns
,
Int
nCOLUMNS
)
{
if
(
nColumns
/
32
>
0
)
dColumnSum
<<<
dim3
(
nColumns
/
32
,
32
),
32
>>>
(
matrix
,
target
,
nRows
,
nColumns
,
nCOLUMNS
);
if
(
nColumns
%
32
>
0
)
{
Int
o
=
nColumns
/
32
*
32
;
dColumnSum
<<<
dim3
(
1
,
32
),
nColumns
-
o
>>>
(
matrix
+
o
,
target
+
o
,
nRows
,
nColumns
,
nCOLUMNS
);
}
}
// Forward pass, aligned variant: gather input rows via the rulebook, multiply
// by a KxK weight tile in shared memory, and accumulate into the scattered
// output rows. `rules` is a flat array of (inputRow, outputRow) index pairs.
// Requires nHot to be a multiple of K — there are no bounds checks here; the
// ragged tail is handled by dConvolution_KMxKN_forwardB.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures,
                                            T *w, Int *rules, Int nHot,
                                            Int input_nPlanes, Int input_stride,
                                            Int output_nPlanes,
                                            Int output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K; // this block's K-wide output column slice
  w += n * K;
  T O[V]; // per-thread output accumulators, one per handled row
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V]; // input row indices for this thread's V rows
  Int R1[V]; // output row indices for this thread's V rows
  const int tx = threadIdx.x;
  int ty[V];
// Each thread covers V rows of the KxK tile, spaced K/V apart.
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
// KxK tile multiply: O row = I row . W column.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
// Read-modify-write accumulation into the output rows selected by R1.
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K-wide slab of input planes.
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Forward pass, tail variant: identical to dConvolution_KMxKN_forwardA except
// that every rulebook access and output update is guarded by s + ty[v] < nHot,
// so nHot need not be a multiple of K. Out-of-range rows leave stale data in
// the shared I tile, but that is harmless: each thread multiplies only its own
// rows, and the guarded store never writes the garbage accumulators back.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures,
                                            T *w, Int *rules, Int nHot,
                                            Int input_nPlanes, Int input_stride,
                                            Int output_nPlanes,
                                            Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
// Accumulate into and write back only the in-range output rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Helper for dConvolution_forward below. If both plane counts are exact
// multiples of K, launch the K-tiled forward kernels and return from the
// enclosing function: kernel A covers the first o = (nHot/K)*K rule pairs
// (a multiple of K, no bounds checks), kernel B the ragged tail.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_forwardA<                                           \
            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
            output_nPlanes, output_stride);                                    \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_forwardB<                                           \
            T, K,                                                              \
            V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(               \
            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      return;                                                                  \
    }                                                                          \
  }
// Forward dispatcher: try tile sizes from largest to smallest; FOO returns
// from this function once both plane counts divide evenly. Reaching
// assert(false) means neither plane count was a multiple of 8 (ragged sizes
// go through the *2 kernel path instead).
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
                          Int nHot, Int input_nPlanes, Int input_stride,
                          Int output_nPlanes, Int output_stride) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
// double specialization: skips the K=64 tile — presumably because the
// shared-memory tiles (two K*K arrays of 8-byte doubles) would not fit at
// K=64; largest double tile is 32. TODO confirm against target architecture
// shared-memory limits.
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
                                  double *w, Int *rules, Int nHot,
                                  Int input_nPlanes, Int input_stride,
                                  Int output_nPlanes, Int output_stride) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Backward pass, aligned variant: per KxK tile, accumulate the input gradient
// (dO . W^T, read-modify-write into dInFeatures) and the weight gradient
// (I^T . dO, merged into dw with atomicAdd across blocks). Requires nHot to
// be a multiple of K; the ragged tail is handled by the _B variant.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  // Shift all pointers to this block's K-wide slab of input planes.
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V]; // per-thread input-gradient accumulators
  T dW[V]; // per-thread weight-gradient accumulators
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
// Each thread covers V rows of the tile, spaced K/V apart.
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
// dI row = dO row . W^T column; dW = I^T . dO (note transposed I access).
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
// Read-modify-write accumulation into dInFeatures.
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
// dw is shared between blocks in the x-dimension, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    // Advance to the next K-wide slab of output planes.
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Backward pass, tail variant of dConvolution_KMxKN_backward_dW_A: every
// rulebook access and dInFeatures update is guarded by s + ty[v] < nHot.
// Out-of-range rows of I and dO are explicitly zeroed because the dW
// accumulation reads whole columns (I[k][ty[v]], dO[k][tx]) across all k,
// so stale rows would corrupt the weight gradient.
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          // Zero-pad so the column-wise dW reads below stay correct.
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Helper for dConvolution_backward_dW below: same tile-dispatch pattern as
// the forward FOO, but launching the gradient kernels. Kernel _A handles the
// first o = (nHot/K)*K rule pairs without bounds checks; kernel _B handles
// the ragged tail. Returns from the enclosing function on success.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dConvolution_KMxKN_backward_dW_A<                                      \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dConvolution_KMxKN_backward_dW_B<                                      \
            T, K,                                                              \
            V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(                \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
            output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }
// Backward dispatcher: try tile sizes from largest to smallest; FOO returns
// once both plane counts divide evenly. Reaching assert(false) means neither
// plane count was a multiple of 8 (ragged sizes use dConvolution_backward_dW2
// below instead).
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                              T *w, T *dw, Int *rules, Int nHot,
                              Int input_nPlanes, Int input_stride,
                              Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
#undef FOO
// Generic forward kernel for ragged plane counts: unlike forwardA/B, the
// plane counts need not be multiples of K. Each tile clamps its valid extent
// to KI = min(K, remaining input planes) and KO = min(K, remaining output
// planes), guarding shared-memory loads and the final output accumulation.
// Rule pairs are staged cooperatively in shared memory (R holds K pairs).
// The output is accumulated with += (callers pre-fill it with bias or zeros).
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures,
                                            T *w, Int *rules, Int nHot,
                                            Int input_nPlanes, Int input_stride,
                                            Int output_nPlanes,
                                            Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  Int M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  Int KO = min(K, output_nPlanes - K * n); // valid output columns in tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ Int R[K * 2]; // K (inputRow, outputRow) pairs, interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    Int KI = min(K, input_nPlanes - K * m); // valid input columns in tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
// (only the first two rows of threads participate; q indexes the
// 2*K flat entries).
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
// Multiply only over the KI valid input columns.
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template
<
typename
T
>
void
dConvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dConvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dConvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Generic backward kernel for ragged plane counts: clamps each tile to
// KI = min(K, remaining input planes) and KO = min(K, remaining output
// planes). Out-of-range rows of I and dO are zero-padded because the dW
// accumulation reads whole columns across all k. dw updates use atomicAdd
// (blocks in the x-dimension accumulate into the same weight gradient).
template <typename T, Int K, Int V>
__global__ void dConvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = (output_nPlanes + K - 1) / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m); // valid input columns in tile
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ Int R[K * 2]; // K (inputRow, outputRow) pairs, interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    Int KO = min(K, output_nPlanes - K * n); // valid output columns in tile
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        else
          I[ty[v]][tx] = 0; // zero-pad for the column-wise dW reads below
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
// Input gradient: only the KO valid output columns contribute.
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
// Weight gradient: full K range is safe thanks to the zero padding.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
template
<
typename
T
>
void
dConvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dConvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
}
#endif
/* CUDA_CONVOLUTION_H */
sparseconvnet/SCN/CUDA/Deconvolution.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Forward declarations; the definitions live in the companion CUDA
// translation unit (presumably Deconvolution.cu — see this commit's diff).
// Deconvolution forward over the rulebook. Returns a double — in the
// analogous convolution path this is the flop count; assumed the same
// here (TODO confirm).
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                               RuleBook _rules, Int input_nPlanes,
                               Int input_stride, Int output_nPlanes,
                               Int output_stride);
// Deconvolution backward over the rulebook: computes the input gradient
// (into dInFeatures) and the weight gradient (into dw).
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, RuleBook _rules,
                                 Int input_nPlanes, Int input_stride,
                                 Int output_nPlanes, Int output_stride);
// Deconvolution forward: build the rulebook (note the swapped output/input
// sizes relative to convolution), size the output to the active output sites,
// seed it with bias (or zeros), then run the rulebook GEMM.
// Returns the value of dDeconvolution_forward2 (flop count in the analogous
// convolution path), or 0 when there are no active output sites.
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  if (not nActiveOut)
    return 0; // no active output sites: nothing to compute
  Int nPlanesIn = weight.size(1);
  Int nPlanesOut = weight.size(2);
  output_features.resize_({nActiveOut, nPlanesOut});
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto wPtr = weight.data<T>();
  // Pre-fill the output: bias rows if a bias is present, zeros otherwise.
  // The GEMM below accumulates on top of this.
  if (bias.numel())
    Convolution_fp_bias(outPtr, bias.data<T>(), nPlanesOut, nActiveOut);
  else
    output_features.zero_();
  return dDeconvolution_forward2<T>(inPtr, outPtr, wPtr, _rules, nPlanesIn,
                                    nPlanesIn, nPlanesOut, nPlanesOut);
}
// Deconvolution backward: build the rulebook (output/input sizes swapped, as
// in the forward pass), then compute the input gradient, weight gradient and
// (optionally) bias gradient.
//
// Fix: d_input_features is now resized and zeroed unconditionally. The
// original only did so inside the nActiveOut branch, leaving the input
// gradient stale/unsized when the output had no active sites — inconsistent
// with cuda_FullConvolution_backward, which zeroes the input gradient before
// its nActive check. With no active outputs the correct input gradient is
// all zeros.
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);
  Int ip = weight.size(1); // input planes
  Int op = weight.size(2); // output planes
  // Always produce a correctly-sized, zeroed input gradient, even when
  // there are no active output sites (in which case it stays all-zero).
  d_input_features.resize_({nActiveIn, ip});
  d_input_features.zero_();
  if (nActiveOut) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op,
                                   op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      // Bias gradient = column sums of d_output_features.
      Convolution_bp_bias(doF, db, op, op, nActiveOut);
    }
  }
}
sparseconvnet/SCN/CUDA/Deconvolution.cu
View file @
de3743f6
...
...
@@ -4,80 +4,600 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "Deconvolution.h"
template
<
typename
T
,
Int
Dimension
>
double
cuda_Deconvolution_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
bias
)
{
auto
_rules
=
m
.
getRuleBook
(
outputSize
,
inputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
weight
.
size
(
2
)});
if
(
not
bias
.
numel
())
output_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
double
flops
=
0
;
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures,
                                              T *w, Int *rules, Int nHot,
                                              Int input_nPlanes,
                                              Int input_stride,
                                              Int output_nPlanes,
                                              Int output_stride) {
  // nHot must be a multiple of K!! (fast path, no bounds checks)
  // Input x Weight -> Output, accumulated into outFeatures.
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks; K multiple of V.
  // nHot x KM -> nHot x KN : parallel over N and nHot, serial loop over M.
  Int M = input_nPlanes / K; // number of K-wide input-plane tiles
  // N = gridDim.y == output_nPlanes / K; this block owns output tile n.
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T acc[V];                 // per-thread output accumulators
  __shared__ T wBuf[K][K];  // staged KxK weight tile
  __shared__ T inBuf[K][K]; // staged KxK input tile
  Int rOut[V];              // rules[2*i]   : output row
  Int rIn[V];               // rules[2*i+1] : input row
  const int tx = threadIdx.x;
  int ty[V]; // each thread covers V rows of the tile
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Stage the weight tile for this (m,n) pair.
#pragma unroll
    for (int v = 0; v < V; v++)
      wBuf[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    // Stride over K-row batches of rulebook entries.
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        rOut[v] = rules[2 * (s + ty[v])];
        rIn[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Stage input rows; reset the accumulators.
#pragma unroll
      for (int v = 0; v < V; v++) {
        inBuf[ty[v]][tx] = inFeatures[rIn[v] * input_stride + tx];
        acc[v] = 0;
      }
      __syncthreads();
      // KxKxK tile multiply.
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          acc[v] += inBuf[ty[v]][k] * wBuf[k][tx];
      // Read-modify-write the output rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        acc[v] += outFeatures[rOut[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[rOut[v] * output_stride + tx] = acc[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures,
                                              T *w, Int *rules, Int nHot,
                                              Int input_nPlanes,
                                              Int input_stride,
                                              Int output_nPlanes,
                                              Int output_stride) {
  // Remainder variant of forwardA: identical tiling, but every rulebook
  // access is guarded by s + ty[v] < nHot, so nHot may be arbitrary.
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks; K multiple of V.
  Int M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes / K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T acc[V];
  __shared__ T wBuf[K][K];
  __shared__ T inBuf[K][K];
  Int rOut[V];
  Int rIn[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Stage the KxK weight tile.
#pragma unroll
    for (int v = 0; v < V; v++)
      wBuf[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Fetch rules only for in-range entries.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          rOut[v] = rules[2 * (s + ty[v])];
          rIn[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Stage in-range input rows; reset accumulators.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          inBuf[ty[v]][tx] = inFeatures[rIn[v] * input_stride + tx];
        acc[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          acc[v] += inBuf[ty[v]][k] * wBuf[k][tx];
      // Read-modify-write only the in-range output rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          acc[v] += outFeatures[rOut[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[rOut[v] * output_stride + tx] = acc[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Forward dispatch helper: if both plane counts are exact multiples of K,
// launch the unguarded kernel (A) on the largest K-multiple prefix of the
// rulebook and the guarded kernel (B) on the remainder, then return.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_forwardA<                                         \
            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
                       dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
                                         input_nPlanes, input_stride,          \
                                         output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_forwardB<                                         \
            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      return;                                                                  \
    }                                                                          \
  }
// Try tile sizes from largest to smallest; a caller only reaches this
// function when both plane counts are multiples of 8, so one FOO fires.
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
                            Int nHot, Int input_nPlanes, Int input_stride,
                            Int output_nPlanes, Int output_stride) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
// double specialization: starts at K=32 (the 64-wide tile is omitted —
// presumably double-sized shared tiles would not fit; confirm before reusing).
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
                                    double *w, Int *rules, Int nHot,
                                    Int input_nPlanes, Int input_stride,
                                    Int output_nPlanes, Int output_stride) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // Unguarded fast path (paired with _B for the remainder entries).
  // M = gridDim.y == input_nPlanes / K; this block owns input tile m.
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T gradIn[V]; // per-thread dInput accumulators
  T gradW[V];  // per-thread dW accumulators
  __shared__ T inBuf[K][K];
  __shared__ T gOutBuf[K][K];
  __shared__ T wBuf[K][K];
  Int rIn[V];
  Int rOut[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Stage the weight tile; clear the dW accumulators.
#pragma unroll
    for (int v = 0; v < V; v++) {
      wBuf[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      gradW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        rOut[v] = rules[2 * (s + ty[v])];
        rIn[v] = rules[2 * (s + ty[v]) + 1];
        gradIn[v] = 0;
      }
      __syncthreads();
      // Stage input and dOutput rows.
#pragma unroll
      for (int v = 0; v < V; v++) {
        inBuf[ty[v]][tx] = inFeatures[rIn[v] * input_stride + tx];
        gOutBuf[ty[v]][tx] = dOutFeatures[rOut[v] * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          gradIn[v] += gOutBuf[ty[v]][k] * wBuf[tx][k]; // dO * W^T
          gradW[v] += inBuf[k][ty[v]] * gOutBuf[k][tx]; // I^T * dO
        }
      // Read-modify-write the dInput rows.
#pragma unroll
      for (int v = 0; v < V; v++)
        gradIn[v] += dInFeatures[rIn[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[rIn[v] * input_stride + tx] = gradIn[v];
      __syncthreads();
    }
    // Flush this tile's dW; blocks race on dw, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], gradW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// NOTE(review): the previous text of this kernel had host-side bias-launch
// code (Convolution_fp_bias<<<...>>> using undefined `bias`/`op`/`nActive`)
// spliced into the middle of the body, leaving unbalanced braces; this is the
// restored kernel: the bounds-checked remainder counterpart of _A.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V]; // per-thread dInput accumulators
  T dW[V]; // per-thread dW accumulators
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  Int R0[V];
  Int R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput; zero-fill out-of-range rows so the
      // unguarded tile multiply below stays correct.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k]; // dO * W^T
          dW[v] += I[k][ty[v]] * dO[k][tx]; // I^T * dO
        }
      // Write back dInput for in-range rows only.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
    // Accumulate this tile's dW; blocks race on dw, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Backward dispatch helper: when both plane counts divide K, run the
// unguarded dW kernel (A) on the K-multiple prefix and the guarded one (B)
// on the remainder, then return from the enclosing function.
#define FOO(T, K, V)                                                           \
  {                                                                            \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
        dDeconvolution_KMxKN_backward_dW_A<                                    \
            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
                       dim3(K, K / V)>>>(                                      \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
        dDeconvolution_KMxKN_backward_dW_B<                                    \
            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
            output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }
// Try tile sizes from largest to smallest; asserts if none divides the
// plane counts (callers route such cases to the *_dW2 kernel instead).
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures,
                                T *dOutFeatures, T *w, T *dw, Int *rules,
                                Int nHot, Int input_nPlanes, Int input_stride,
                                Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
#undef FOO
// General-size forward kernel. Fix: a stray `Int c = ip * op;` had been
// spliced into the inner loop (ip/op are not defined in this kernel and the
// statement had no effect) — removed.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures,
                                              T *w, Int *rules, Int nHot,
                                              Int input_nPlanes,
                                              Int input_stride,
                                              Int output_nPlanes,
                                              Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nPlanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  // Plane counts need not divide K: tile edges are clipped by KI/KO and
  // outputs are accumulated with +=.
  Int M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  Int n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  Int KO = min(K, output_nPlanes - K * n); // valid columns of this tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ Int R[K * 2]; // rules for K sites: (outRow, inRow) interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    Int KI = min(K, input_nPlanes - K * m); // valid rows of this tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
      // Accumulate into the output rows (in-range threads only).
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// General-size backward kernel: plane counts need not divide K; tile edges
// are clipped by KI/KO and out-of-range shared cells are zero-filled.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K (rounded up); this block owns tile m.
  Int N = (output_nPlanes + K - 1) / K;
  Int m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m); // valid input planes in this tile
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ Int R[K * 2]; // rules for K sites: (outRow, inRow) interleaved
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    Int KO = min(K, output_nPlanes - K * n); // valid output planes
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput; zero-fill out-of-range cells so the
      // unguarded accumulation loops below stay correct.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k]; // dO * W^T
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx]; // I^T * dO
      __syncthreads();
      // Accumulate dInput for in-range threads.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
    // Flush dW for the valid sub-tile; blocks race on dw, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Launch the deconvolution forward pass over a whole rulebook and return the
// multiply-accumulate count (used by the caller for FLOP reporting).
// Fix: a fragment of the caller's own RULEBOOKITERATOR invocation
// (`dDeconvolution_forward2<T>(iF, oF, ...)`) had been spliced into this
// body, making it self-recursive and syntactically broken — removed.
// Dispatch: the fully-general forward2 kernel when either plane count is not
// a multiple of 8, otherwise the tiled fast-path dispatcher.
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                               RuleBook _rules, Int input_nPlanes,
                               Int input_stride, Int output_nPlanes,
                               Int output_stride) {
  Int c = input_nPlanes * output_nPlanes; // weight block size per filter site
  double flops = 0;
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    RULEBOOKITERATOR((dDeconvolution_KMxKN_forward2<
                         T, K, V><<<dim3(128, (output_nPlanes + K - 1) / K),
                                    dim3(K, K / V)>>>(
                         inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes,
                         input_stride, output_nPlanes, output_stride));
                     , w += c; flops += nHotB * c;)
  } else {
    RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
                                            nHotB, input_nPlanes, input_stride,
                                            output_nPlanes, output_stride);
                     , w += c; flops += nHotB * c;)
  }
  return flops;
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_Deconvolution_backward
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
filterSize
,
/*long*/
at
::
Tensor
filterStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
/*cuda float*/
at
::
Tensor
weight
,
/*cuda float*/
at
::
Tensor
d_weight
,
/*cuda float*/
at
::
Tensor
d_bias
)
{
auto
_rules
=
m
.
getRuleBook
(
outputSize
,
inputSize
,
filterSize
,
filterStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
ip
=
input_features
.
size
(
1
);
Int
op
=
d_output_features
.
size
(
1
);
auto
w
=
weight
.
data
<
T
>
();
auto
dw
=
d_weight
.
data
<
T
>
();
Int
c
=
ip
*
op
;
RULEBOOKITERATOR
(
dDeconvolution_backward_dW2
<
T
>
(
iF
,
diF
,
doF
,
w
,
dw
,
rbB
,
nHotB
,
ip
,
ip
,
op
,
op
);
template
<
typename
T
>
void
dDeconvolution_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
RuleBook
_rules
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
Int
c
=
input_nPlanes
*
output_nPlanes
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
RULEBOOKITERATOR
(
(
dDeconvolution_KMxKN_backward_dW2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
));
,
w
+=
c
;
dw
+=
c
;)
}
else
{
RULEBOOKITERATOR
(
dDeconvolution_backward_dW
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rbB
,
nHotB
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
,
w
+=
c
;
dw
+=
c
;)
if
(
d_bias
.
numel
())
{
auto
db
=
d_bias
.
data
<
T
>
();
Convolution_bp_bias
(
doF
,
db
,
op
,
op
,
nActive
);
}
}
sparseconvnet/SCN/CUDA/Deconvolution.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_DECONVOLUTION_H
#define CUDA_DECONVOLUTION_H
#include "Convolution.h"
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dDeconvolution_forward
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
template
<
>
void
dDeconvolution_forward
<
double
>
(
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
assert
(
false
);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
Int
N
=
output_nPlanes
/
K
;
Int
m
=
blockIdx
.
y
;
inFeatures
+=
m
*
K
;
dInFeatures
+=
m
*
K
;
w
+=
m
*
K
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
;
T
dI
[
V
];
T
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
Int
R0
[
V
];
Int
R1
[
V
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R1
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])];
R0
[
v
]
=
rules
[
2
*
(
s
+
ty
[
v
])
+
1
];
}
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
else
{
I
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dDeconvolution_backward_dW
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
#undef FOO
template
<
typename
T
,
Int
K
,
Int
V
>
__global__
void
dDeconvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
Int
n
=
blockIdx
.
y
;
outFeatures
+=
n
*
K
;
w
+=
n
*
K
;
Int
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
T
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
Int
R
[
K
*
2
];
const
int
tx
=
threadIdx
.
x
;
int
ty
[
V
];
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int
m
=
0
;
m
<
M
;
m
++
)
{
Int
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
Int
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
2
)
{
int
q
=
ty
[
v
]
*
K
+
tx
;
if
(
s
+
q
/
2
<
nHot
)
R
[
q
]
=
rules
[
2
*
s
+
q
];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int
k
=
0
;
k
<
KI
;
k
++
)
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R
[
2
*
ty
[
v
]]
*
output_stride
+
tx
]
+=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
>
void
dDeconvolution_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
Int
*
rules
,
Int
nHot
,
Int
input_nPlanes
,
Int
input_stride
,
Int
output_nPlanes
,
Int
output_stride
)
{
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int
K
=
16
;
const
int
V
=
4
;
dDeconvolution_KMxKN_forward2
<
T
,
K
,
V
><<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
),
dim3
(
K
,
K
/
V
)
>>>
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
return
;
}
else
{
dDeconvolution_forward
(
inFeatures
,
outFeatures
,
w
,
rules
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// Tiled deconvolution backward pass. Each block owns one K-row slice of the
// weight matrix (chosen by blockIdx.y) and sweeps the rulebook in chunks of
// K (output,input) row pairs. Rules layout, per the reads below:
// rules[2*s] is the output row and rules[2*s+1] the input row of pair s.
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
    Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = (output_nPlanes + K - 1) / K; // number of K-wide output tiles
  Int m = blockIdx.y;                   // this block's input-plane tile index
  // Shift base pointers to this block's K input planes / weight slice.
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m); // valid planes in a ragged last tile
  T dI[V]; // per-thread dInput accumulators, one per covered tile row
  T dW[V]; // per-thread dW accumulators
  __shared__ T I[K][K];    // input feature tile
  __shared__ T dO[K][K];   // output gradient tile
  __shared__ T W[K][K];    // weight tile
  __shared__ Int R[K * 2]; // rules for the current K row pairs
  const int tx = threadIdx.x;
  // Each thread covers V tile rows, spaced K/V apart.
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    Int KO = min(K, output_nPlanes - K * n); // valid output planes this tile
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    // Stride over the rulebook, K pairs per block per step.
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput; zero-pad rows past nHot and planes past
      // KI/KO so the unrolled products below need no bounds checks.
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
      // dI += dO x W^T (only the KO valid output planes contribute)
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
      // dW += I^T x dO (full K range; padded rows contribute zero)
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
      // Scatter dI to the input-gradient rows named by the rules.
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
    // Flush the accumulated dW tile; atomicAdd because all blocks along
    // gridDim.x accumulate into the same weight-gradient entries.
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    // Advance to the next K-wide output tile.
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Deconvolution backward dispatch (computes both dInput and dW): plane
// counts that are multiples of 8 use dDeconvolution_backward_dW; anything
// else falls back to the generic K=16, V=4 tiled kernel.
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, Int *rules,
                                 Int nHot, Int input_nPlanes, Int input_stride,
                                 Int output_nPlanes, Int output_stride) {
  const bool multipleOf8 =
      (input_nPlanes % 8 == 0) and (output_nPlanes % 8 == 0);
  if (multipleOf8) {
    dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                               rules, nHot, input_nPlanes, input_stride,
                               output_nPlanes, output_stride);
  } else {
    constexpr int K = 16; // tile edge
    constexpr int V = 4;  // tile rows handled per thread
    dim3 grid(128, (input_nPlanes + K - 1) / K); // one weight slice per y
    dim3 block(K, K / V);
    dDeconvolution_KMxKN_backward_dW2<T, K, V><<<grid, block>>>(
        inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
        input_nPlanes, input_stride, output_nPlanes, output_stride);
  }
}
#endif
/* CUDA_DECONVOLUTION_H */
sparseconvnet/SCN/CUDA/IOLayers.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
);
template
<
typename
T
>
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
);
// Input layer forward: builds the input-layer rulebook, then copies (mode 0)
// or gathers/accumulates (other modes; mode 4 averages) the input features
// into the sparse row layout via InputLayer_fp.
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = input_features.size(1);
  const Int maxActive = ruleBook[0][1]; // max duplicates per output row
  const Int nRows = ruleBook[0][3];     // nOutputRows
  if (mode == 0) {
    // Rows map one-to-one: a plain copy suffices.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  // Device-side staging buffer for the host-resident rulebook.
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(input_features.data<T>(), output_features.data<T>(), nRows,
                   maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                   mode == 4);
}
// Input layer backward: routes output gradients back to the input rows
// recorded in the input-layer rulebook (InputLayer_bp scatters with the
// same averaging flag used in the forward pass).
template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = d_output_features.size(1);
  const Int mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][3];
  if (mode == 0) {
    // Identity forward pass => gradient passes straight through.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  d_input_features.resize_({ruleBook[0][2], nPlanes}); // nInputRows x planes
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// Output layer forward: scatters sparse rows back to the original input-row
// layout by reusing the input layer's rulebook in reverse.
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*cuda float*/ at::Tensor input_features,
                                   /*cuda float*/ at::Tensor output_features) {
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = input_features.size(1);
  const auto mode = ruleBook[0][0];
  const auto maxActive = ruleBook[0][1];
  const auto nRows = ruleBook[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  output_features.resize_({ruleBook[0][2], nPlanes}); // nInputRows x planes
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  // InputLayer_bp writes into its first argument, hence (out, in) order.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                false);
}
// Output layer backward: gathers gradients back into the sparse row layout
// (InputLayer_fp with (d_out, d_in) argument order, no averaging).
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.inputLayerRuleBook;
  const Int nPlanes = d_output_features.size(1);
  const auto mode = ruleBook[0][0];
  const auto maxActive = ruleBook[0][1];
  const auto nRows = ruleBook[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &ruleBook[1][0],
                   rulesBuffer.data<Int>(), false);
}
// Batch-length ("BL") input layer forward: builds the bl rulebook, then
// flattens the (batch, length, nPlanes) input into sparse rows.
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*long*/ at::Tensor spatialSize,
                                    /*long*/ at::Tensor input_coords,
                                    /*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor output_features,
                                    long mode) {
  m.blLayer(spatialSize, input_coords, mode);
  const Int nPlanes = input_features.size(2);
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  auto &ruleBook = m.blLayerRuleBook;
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    // Straight copy, reshaped to the {nActive, nPlanes} sparse layout.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
    return;
  }
  // Stage the host rulebook on the device, then gather/accumulate rows.
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(input_features.data<T>(), output_features.data<T>(), nRows,
                   maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                   mode == 4);
}
// BL input layer backward: scatters output gradients back into the
// (batch, length, nPlanes) input gradient via the bl rulebook.
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.blLayerRuleBook;
  const Int nPlanes = d_output_features.size(1);
  const Int mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    // Gradient passes through; reshape back to (batch, length, planes).
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
    return;
  }
  d_input_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// BL output layer forward: scatters sparse rows back out to the
// (batch, length, nPlanes) layout using the bl rulebook in reverse.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features) {
  auto &ruleBook = m.blLayerRuleBook;
  const Int nPlanes = input_features.size(1);
  const auto mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
    return;
  }
  output_features.resize_({ruleBook[0][2], ruleBook[0][3], nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  // InputLayer_bp writes into its first argument, hence (out, in) order.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &ruleBook[1][0], rulesBuffer.data<Int>(),
                false);
}
// BL output layer backward: gathers (batch, length, nPlanes) gradients back
// into the sparse {nRows, nPlanes} layout (no averaging).
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &ruleBook = m.blLayerRuleBook;
  const Int nPlanes = d_output_features.size(2);
  const Int mode = ruleBook[0][0];
  const Int maxActive = ruleBook[0][1];
  const Int nRows = ruleBook[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
    return;
  }
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)ruleBook[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &ruleBook[1][0],
                   rulesBuffer.data<Int>(), false);
}
sparseconvnet/SCN/CUDA/IOLayers.cu
View file @
de3743f6
...
...
@@ -4,241 +4,67 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
// Rulebook Format
// rules[0][0] == mode
// rules[0][1] == maxActive per spatial location (==1 for modes 0,1,2)
// rules[0][2] == nInputRows
// rules[0][3] == nOutputRows
// rules[1] nOutputRows x (1+maxActive)
template
<
typename
T
,
Int
Dimension
>
void
cuda_InputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*long*/
at
::
Tensor
spatialSize
,
/*long*/
at
::
Tensor
input_coords
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
batchSize
,
long
mode
)
{
m
.
inputLayer
(
spatialSize
,
input_coords
,
batchSize
,
mode
);
Int
nPlanes
=
input_features
.
size
(
1
);
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
}
else
{
output_features
.
resize_
({
*
m
.
inputNActive
,
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_fp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
iF
,
oF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_InputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
}
else
{
d_input_features
.
resize_
({
rules
[
0
][
2
],
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
diF
,
doF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
template
<
typename
T
>
__global__
void
InputLayer_fp_
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
out
[
plane
]
+=
multiplier
*
inp
[
plane
];
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_OutputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
input_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
auto
maxActive
=
rules
[
0
][
1
];
auto
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
}
else
{
output_features
.
resize_
({
rules
[
0
][
2
],
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
oF
,
iF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
false
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_OutputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
inputLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
auto
maxActive
=
rules
[
0
][
1
];
auto
nRows
=
rules
[
0
][
3
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
}
else
{
d_input_features
.
resize_
({
nRows
,
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_fp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
doF
,
diF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
false
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLInputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*long*/
at
::
Tensor
spatialSize
,
/*long*/
at
::
Tensor
input_coords
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
mode
)
{
m
.
blLayer
(
spatialSize
,
input_coords
,
mode
);
Int
nPlanes
=
input_features
.
size
(
2
);
output_features
.
resize_
({
*
m
.
inputNActive
,
nPlanes
});
output_features
.
zero_
();
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
output_features
.
resize_
({
*
m
.
inputNActive
,
nPlanes
});
}
else
{
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
template
<
typename
T
>
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
)
{
cudaMemcpy
(
rules_gpu
,
rules_cpu
,
sizeof
(
Int
)
*
nRows
*
(
1
+
maxActive
),
cudaMemcpyHostToDevice
);
InputLayer_fp
<
InputLayer_fp
_
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
iF
,
oF
,
nRows
,
maxActive
,
nPlanes
,
r
b
,
mode
==
4
);
}
input_features
,
output_features
,
nRows
,
maxActive
,
nPlanes
,
r
ules_gpu
,
average
);
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLInputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
1
);
Int
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
d_input_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
}
else
{
d_input_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
diF
,
doF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
mode
==
4
);
template
<
typename
T
>
__global__
void
InputLayer_bp_
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
d_output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
d_input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
atomicAdd
(
&
inp
[
plane
],
multiplier
*
out
[
plane
]);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLOutputLayer_updateOutput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
)
{
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
nPlanes
=
input_features
.
size
(
1
);
auto
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
output_features
.
resize_as_
(
input_features
);
output_features
.
copy_
(
input_features
);
output_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
}
else
{
output_features
.
resize_
({
rules
[
0
][
2
],
rules
[
0
][
3
],
nPlanes
});
output_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
cudaMemcpyHostToDevice
);
InputLayer_bp
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
oF
,
iF
,
nRows
,
maxActive
,
nPlanes
,
rb
,
false
);
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_BLOutputLayer_updateGradInput
(
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
auto
&
rules
=
m
.
blLayerRuleBook
;
Int
nPlanes
=
d_output_features
.
size
(
2
);
Int
mode
=
rules
[
0
][
0
];
Int
maxActive
=
rules
[
0
][
1
];
Int
nRows
=
rules
[
0
][
4
];
if
(
mode
==
0
)
{
d_input_features
.
resize_as_
(
d_output_features
);
d_input_features
.
copy_
(
d_output_features
);
d_input_features
.
resize_
({
nRows
,
nPlanes
});
}
else
{
d_input_features
.
resize_
({
nRows
,
nPlanes
});
d_input_features
.
zero_
();
auto
rulesBuffer
=
at
::
CUDA
(
at_kINT
).
tensor
({(
int
)
rules
[
1
].
size
()});
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
Int
*
rb
=
rulesBuffer
.
data
<
Int
>
();
cudaMemcpy
(
rb
,
&
rules
[
1
][
0
],
sizeof
(
Int
)
*
rules
[
1
].
size
(),
template
<
typename
T
>
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules_cpu
,
Int
*
rules_gpu
,
bool
average
)
{
cudaMemcpy
(
rules_gpu
,
rules_cpu
,
sizeof
(
Int
)
*
nRows
*
(
1
+
maxActive
),
cudaMemcpyHostToDevice
);
InputLayer_
fp
<
InputLayer_
bp_
<
T
><<<
std
::
min
(
nRows
,
(
Int
)
32768
),
std
::
min
(
nPlanes
,
(
Int
)
32
)
>>>
(
doF
,
diF
,
nRows
,
maxActive
,
nPlanes
,
r
b
,
false
);
}
d_input_features
,
d_output_features
,
nRows
,
maxActive
,
nPlanes
,
r
ules_gpu
,
average
);
}
sparseconvnet/SCN/CUDA/IOLayers.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_IOLAYERS_H
#define CUDA_IOLAYERS_H
template
<
typename
T
>
__global__
void
InputLayer_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
out
[
plane
]
+=
multiplier
*
inp
[
plane
];
}
}
}
template
<
typename
T
>
__global__
void
InputLayer_bp
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nRows
,
Int
maxActive
,
Int
nPlanes
,
Int
*
rules
,
bool
average
)
{
for
(
int
row
=
blockIdx
.
x
;
row
<
nRows
;
row
+=
gridDim
.
x
)
{
T
*
out
=
d_output_features
+
row
*
nPlanes
;
Int
*
r
=
rules
+
row
*
(
1
+
maxActive
);
Int
nActive
=
r
[
0
];
T
multiplier
=
(
average
and
nActive
>
0
)
?
1.0
f
/
nActive
:
1.0
f
;
for
(
int
i
=
1
;
i
<=
nActive
;
i
++
)
{
T
*
inp
=
d_input_features
+
r
[
i
]
*
nPlanes
;
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
blockDim
.
x
)
atomicAdd
(
&
inp
[
plane
],
multiplier
*
out
[
plane
]);
}
}
}
#endif
/* CUDA_IOLAYERS_H */
sparseconvnet/SCN/CUDA/LeakyReLU.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
LeakyReLU_fp
(
T
*
input_features
,
T
*
output_features
,
Int
n
,
T
alpha
);
template
<
typename
T
>
void
LeakyReLU_bp
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
Int
n
,
T
alpha
);
// Elementwise leaky-ReLU forward over the whole feature tensor.
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 T alpha) {
  output_features.resize_as_(input_features);
  const auto nElements = input_features.numel();
  LeakyReLU_fp<T>(input_features.data<T>(), output_features.data<T>(),
                  nElements, alpha);
}
// Elementwise leaky-ReLU backward; the forward input decides which branch's
// slope each gradient element gets.
template <typename T>
void cuda_LeakyReLU_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, T alpha) {
  d_input_features.resize_as_(d_output_features);
  const auto nElements = d_input_features.numel();
  LeakyReLU_bp<T>(input_features.data<T>(), d_input_features.data<T>(),
                  d_output_features.data<T>(), nElements, alpha);
}
sparseconvnet/SCN/CUDA/LeakyReLU.cu
View file @
de3743f6
...
...
@@ -4,26 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "LeakyReLU.h"
template
<
typename
T
>
void
cuda_LeakyReLU_updateOutput
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
float
alpha
)
{
output_features
.
resize_as_
(
input_features
);
auto
n
=
input_features
.
numel
();
LeakyReLU_fp
<
T
><<<
16
,
1024
>>>
(
input_features
.
data
<
T
>
(),
output_features
.
data
<
T
>
(),
n
,
alpha
);
__global__
void
LeakyReLU_fp_
(
T
*
input_features
,
T
*
output_features
,
Int
n
,
T
alpha
)
{
for
(
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
output_features
[
i
]
=
(
input_features
[
i
]
>
0
)
?
input_features
[
i
]
:
(
input_features
[
i
]
*
alpha
);
}
template
<
typename
T
>
void
LeakyReLU_fp
(
T
*
input_features
,
T
*
output_features
,
Int
n
,
T
alpha
)
{
LeakyReLU_fp_
<
T
><<<
16
,
1024
>>>
(
input_features
,
output_features
,
n
,
alpha
);
}
template
<
typename
T
>
__global__
void
LeakyReLU_bp_
(
T
*
input_features
,
T
*
d_input_features
,
T
*
d_output_features
,
Int
n
,
T
alpha
)
{
for
(
Int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
n
;
i
+=
16
*
1024
)
d_input_features
[
i
]
=
(
input_features
[
i
]
>
0
)
?
d_output_features
[
i
]
:
(
d_output_features
[
i
]
*
alpha
);
}
template
<
typename
T
>
void
cuda_LeakyReLU_updateGradInput
(
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
float
alpha
)
{
d_input_features
.
resize_as_
(
d_output_features
);
auto
n
=
d_input_features
.
numel
();
LeakyReLU_bp
<
T
><<<
16
,
1024
>>>
(
input_features
.
data
<
T
>
(),
d_input_features
.
data
<
T
>
(),
d_output_features
.
data
<
T
>
(),
n
,
alpha
);
void
LeakyReLU_bp
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
Int
n
,
T
alpha
)
{
LeakyReLU_bp_
<
T
><<<
16
,
1024
>>>
(
input_features
,
d_input_features
,
output_features
,
n
,
alpha
);
}
sparseconvnet/SCN/CUDA/LeakyReLU.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
// Elementwise leaky-ReLU forward kernel: out[i] = x[i] > 0 ? x[i] : alpha*x[i].
// Canonical grid-stride loop: the original stepped by a hard-coded 16*1024
// (matching its <<<16, 1024>>> launch), which silently skips elements if the
// kernel is ever launched with a different configuration. gridDim.x *
// blockDim.x is identical under the existing launch and correct for any.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
                             T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    output_features[i] = (input_features[i] > 0)
                             ? input_features[i]
                             : (input_features[i] * alpha);
}
// Elementwise leaky-ReLU backward kernel: dIn[i] gets dOut[i] scaled by 1 or
// alpha depending on the sign of the forward input.
// Canonical grid-stride loop: the original stepped by a hard-coded 16*1024
// (matching its <<<16, 1024>>> launch), which silently skips elements under
// any other configuration. gridDim.x * blockDim.x is identical under the
// existing launch and correct for any.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, Int n, T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
#endif
sparseconvnet/SCN/CUDA/MaxPooling.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
cuda_MaxPooling_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
RuleBook
_rules
);
template
<
typename
T
>
void
cuda_MaxPooling_BackwardPass
(
T
*
input_features
,
T
*
d_input_features
,
T
*
output_features
,
T
*
d_output_features
,
Int
nPlanes
,
Int
input_stride
,
Int
output_stride
,
RuleBook
_rules
);
// Sparse max-pooling forward. The first nFeaturesToDrop input planes are
// skipped via pointer offset; strides are the full row widths so indexing
// inside the kernel stays consistent with the underlying tensors.
template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  const Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  const Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  cuda_MaxPooling_ForwardPass<T>(
      input_features.data<T>() + nFeaturesToDrop, output_features.data<T>(),
      nPlanes, input_features.size(1), output_features.size(1), _rules);
}
// Sparse max-pooling backward.
// Fix: the forward pass pools input_features.data<T>() + nFeaturesToDrop,
// but the original backward pass passed unoffset iF/diF, so when
// nFeaturesToDrop > 0 the argmax comparison and the gradient write-back
// addressed the wrong feature columns. iF and diF are now offset to mirror
// the forward pass (oF/doF need no offset: the output has only nPlanes
// columns). Behavior is unchanged for nFeaturesToDrop == 0.
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  auto _rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
  cuda_MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
                                  input_features.size(1),
                                  d_output_features.size(1), _rules);
}
// Forward max pooling with randomized strides: identical to
// cuda_MaxPooling_updateOutput except the rule book comes from
// getRandomizedStrideRuleBook.
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
  Int planeCount = input_features.size(1) - nFeaturesToDrop;
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                          poolStride, true);
  Int activeSites = m.getNActive(outputSize);
  // (active sites) x (kept planes), zero-filled before max-reduction.
  output_features.resize_({activeSites, planeCount});
  output_features.zero_();
  // Skip the dropped leading planes of the input.
  auto inPtr = input_features.data<T>() + nFeaturesToDrop;
  auto outPtr = output_features.data<T>();
  cuda_MaxPooling_ForwardPass<T>(inPtr, outPtr, planeCount,
                                 input_features.size(1),
                                 output_features.size(1), rb);
}
// Backward pass for randomized-stride max pooling: identical to
// cuda_MaxPooling_updateGradInput except for the rule-book source.
// NOTE(review): as in the non-randomized backward pass, the pointers are
// not offset by nFeaturesToDrop — confirm intended.
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize, /*long*/ at::Tensor poolStride,
    Metadata<Dimension> &m, /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  Int planeCount = input_features.size(1) - nFeaturesToDrop;
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                          poolStride, true);
  // Input-gradient buffer mirrors the input and starts at zero.
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto inPtr = input_features.data<T>();
  auto outPtr = output_features.data<T>();
  auto dInPtr = d_input_features.data<T>();
  auto dOutPtr = d_output_features.data<T>();
  cuda_MaxPooling_BackwardPass<T>(inPtr, dInPtr, outPtr, dOutPtr, planeCount,
                                  input_features.size(1),
                                  d_output_features.size(1), rb);
}
sparseconvnet/SCN/CUDA/MaxPooling.cu
View file @
de3743f6
...
...
@@ -4,100 +4,74 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
#include "RuleBookIterator.h"
template
<
typename
T
,
Int
Dimension
>
void
cuda_MaxPooling_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
nPlanes
});
output_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
()
+
nFeaturesToDrop
;
auto
oF
=
output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_MaxPooling_ForwardPass
<
T
>
(
iF
,
oF
,
nPlanes
,
input_features
.
size
(
1
),
output_features
.
size
(
1
),
rbB
,
nHotB
);
,
)
// Max-pooling forward kernel. `rules` holds nHot (inputRow, outputRow)
// pairs; each NTX x NTY thread block handles NTY rules per iteration:
// thread row threadIdx.y owns one rule and its NTX threads sweep planes.
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  // Blocks stride through the rule list, NTY rules at a time.
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // Flat thread index; the first NTY*2 threads copy the rule pairs,
      // bounded by the pairs actually remaining (2 * (nHot - n)).
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads(); // all rules staged before any row reads them
    if (n + threadIdx.y < nHot) {
      // Element offsets of this rule's input and output rows.
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Compare-and-store running max per plane (caller zeroes the output).
      // NOTE(review): distinct rules writing the same output row are not
      // synchronized here — confirm rule batches avoid output collisions.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads(); // keep r intact until every row has finished
  }
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_MaxPooling_updateGradInput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_
MaxPooling_
BackwardPass
<
T
>
(
iF
,
diF
,
oF
,
doF
,
nPlanes
,
input_features
.
size
(
1
)
,
d_output_features
.
size
(
1
),
rbB
,
nHotB
);
// Host-side launcher for the forward max-pooling kernel.
// RULEBOOKITERATOR (defined in RuleBookIterator.h, not visible here) walks
// the rule book; inside the macro body, `rbB` and `nHotB` are presumably
// the current rule-batch pointer and its rule count — confirm against the
// macro definition. Second macro argument (per-batch epilogue) is empty.
// Launch shape: 32 blocks of 32x32 threads (planes x rules).
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, RuleBook _rules) {
  RULEBOOKITERATOR((MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       input_features, output_features, nPlanes, input_stride,
                       output_stride, rbB, nHotB));
                   , )
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_RandomizedStrideMaxPooling_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
Int
nActive
=
m
.
getNActive
(
outputSize
);
output_features
.
resize_
({
nActive
,
nPlanes
});
output_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
()
+
nFeaturesToDrop
;
auto
oF
=
output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_MaxPooling_ForwardPass
<
T
>
(
iF
,
oF
,
nPlanes
,
input_features
.
size
(
1
),
output_features
.
size
(
1
),
rbB
,
nHotB
);
,
)
// Max-pooling backward kernel. For each rule (inputRow, outputRow), a
// plane's output gradient is added to every input position whose value
// equals the pooled maximum (on ties, all matching inputs receive it).
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY (input, output) row pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Equality test identifies the argmax position(s) of the forward pass.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads(); // protect r until all rows of this batch are done
  }
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_RandomizedStrideMaxPooling_updateGradInput
(
/*long*/
at
::
Tensor
inputSize
,
/*long*/
at
::
Tensor
outputSize
,
/*long*/
at
::
Tensor
poolSize
,
/*long*/
at
::
Tensor
poolStride
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
/*cuda float*/
at
::
Tensor
d_output_features
,
long
nFeaturesToDrop
)
{
Int
nPlanes
=
input_features
.
size
(
1
)
-
nFeaturesToDrop
;
auto
_rules
=
m
.
getRandomizedStrideRuleBook
(
inputSize
,
outputSize
,
poolSize
,
poolStride
,
true
);
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
cuda_MaxPooling_BackwardPass
<
T
>
(
iF
,
diF
,
oF
,
doF
,
nPlanes
,
input_features
.
size
(
1
),
d_output_features
.
size
(
1
),
rbB
,
nHotB
);
// Host-side launcher for the backward max-pooling kernel.
// RULEBOOKITERATOR walks the rule book; `rbB`/`nHotB` are presumably the
// per-batch rule pointer and count supplied by the macro (defined in
// RuleBookIterator.h — confirm there). Epilogue argument is empty.
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                                  T *output_features, T *d_output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, RuleBook _rules) {
  RULEBOOKITERATOR((MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       input_features, d_input_features, output_features,
                       d_output_features, nPlanes, input_stride, output_stride,
                       rbB, nHotB));
                   , )
}
sparseconvnet/SCN/CUDA/MaxPooling.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_MAXPOOLING_H
#define CUDA_MAXPOOLING_H
// (Deleted header version.) Max-pooling forward kernel: `rules` holds nHot
// (inputRow, outputRow) pairs; each NTX x NTY block handles NTY rules per
// iteration, one rule per thread row, NTX threads sweeping the planes.
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads(); // rules staged before use
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Compare-and-store running max per plane (output pre-zeroed by host).
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads(); // keep r intact until the batch completes
  }
}
// (Deleted header version.) Launch the forward max-pooling kernel over the
// whole rule list in one call. Fixed launch shape: 32 blocks of 32x32
// threads; the kernel grid-strides over the nHot rules.
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
                                 Int nPlanes, Int input_stride,
                                 Int output_stride, Int *rules, Int nHot) {
  MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(input_features,
                                                 output_features, nPlanes,
                                                 input_stride, output_stride,
                                                 rules, nHot);
}
// (Deleted header version.) Max-pooling backward kernel: adds each output
// gradient to every input position whose value equals the pooled maximum.
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot) {
  // Shared staging area for the next NTY (input, output) row pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      // Equality identifies the forward argmax; ties all get the gradient.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// (Deleted header version.) Launch the backward max-pooling kernel over the
// whole rule list. Fixed launch shape: 32 blocks of 32x32 threads.
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                                  T *output_features, T *d_output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot) {
  MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, rules, nHot);
}
#endif
/* CUDA_MAXPOOLING_H */
sparseconvnet/SCN/CUDA/NetworkInNetwork.c
u
→
sparseconvnet/SCN/CUDA/NetworkInNetwork.c
pp
View file @
de3743f6
...
...
@@ -4,8 +4,6 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include <algorithm>
template
<
typename
T
>
...
...
sparseconvnet/SCN/CUDA/SparseToDense.cpp
0 → 100644
View file @
de3743f6
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template
<
typename
T
>
void
cuda_SparseToDense_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
spatialVolume
,
RuleBook
_rules
);
template
<
typename
T
>
void
cuda_SparseToDense_BackwardPass
(
T
*
d_input_features
,
T
*
d_output_features
,
Int
nPlanes
,
Int
spatialVolume
,
RuleBook
_rules
);
// Scatter a sparse tensor's features into a freshly-sized dense tensor of
// shape (batch, nPlanes, spatial dims...), zero-filled first.
template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(/*long*/ at::Tensor inputSize,
                                     Metadata<Dimension> &m,
                                     /*cuda float*/ at::Tensor input_features,
                                     /*cuda float*/ at::Tensor output_features,
                                     long nPlanes) {
  { // Shape the dense output: batch x planes x spatial dimensions.
    std::array<long, Dimension + 2> denseShape;
    denseShape[0] = m.grids.begin()->second.size(); // batch size
    denseShape[1] = nPlanes;
    long *spatialExtent = inputSize.data<long>();
    for (Int d = 0; d < Dimension; ++d)
      denseShape[d + 2] = spatialExtent[d];
    output_features.resize_(denseShape);
    output_features.zero_();
  }
  // Only scatter when the features form a 2-D (sites x planes) matrix.
  if (input_features.ndimension() == 2) {
    auto rb = m.getSparseToDenseRuleBook(inputSize, true);
    Int featCount = input_features.size(1);
    auto inPtr = input_features.data<T>();
    auto outPtr = output_features.data<T>();
    // Number of cells per plane in one dense sample.
    long spatialVolume = inputSize.prod().data<long>()[0];
    cuda_SparseToDense_ForwardPass<T>(inPtr, outPtr, featCount, spatialVolume,
                                      rb);
  }
}
// Gather gradients from the dense tensor back into sparse layout.
// d_input_features is sized like input_features and cleared before the copy.
template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  // Only gather when the features form a 2-D (sites x planes) matrix.
  if (input_features.ndimension() == 2) {
    auto rb = m.getSparseToDenseRuleBook(inputSize, true);
    // Number of cells per plane in one dense sample.
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int featCount = d_input_features.size(1);
    auto dInPtr = d_input_features.data<T>();
    auto dOutPtr = d_output_features.data<T>();
    cuda_SparseToDense_BackwardPass<T>(dInPtr, dOutPtr, featCount,
                                       spatialVolume, rb);
  }
}
sparseconvnet/SCN/CUDA/SparseToDense.cu
View file @
de3743f6
...
...
@@ -4,53 +4,66 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"
template
<
typename
T
,
Int
Dimension
>
void
cuda_SparseToDense_updateOutput
(
/*long*/
at
::
Tensor
inputSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
output_features
,
long
nPlanes
)
{
// NTX must be >=2 so r is filled properly
template
<
typename
T
,
Int
NTX
,
Int
NTY
>
__global__
void
SparseToDense_fp
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
spatialVolume
,
Int
*
rules
,
Int
nHot
)
{
__shared__
Int
r
[
NTY
*
2
];
for
(
Int
n
=
blockIdx
.
x
*
NTY
;
n
<
nHot
;
n
+=
gridDim
.
x
*
NTY
)
{
{
std
::
array
<
long
,
Dimension
+
2
>
sz
;
sz
[
0
]
=
m
.
grids
.
begin
()
->
second
.
size
();
// batch size
sz
[
1
]
=
nPlanes
;
long
*
in_sz
=
inputSize
.
data
<
long
>
();
for
(
Int
i
=
0
;
i
<
Dimension
;
++
i
)
sz
[
i
+
2
]
=
in_sz
[
i
];
output_features
.
resize_
(
sz
);
output_features
.
zero_
();
Int
i
=
threadIdx
.
x
+
NTX
*
threadIdx
.
y
;
if
(
i
<
NTY
*
2
and
i
<
2
*
(
nHot
-
n
))
r
[
i
]
=
rules
[
2
*
n
+
i
];
}
__syncthreads
();
if
(
n
+
threadIdx
.
y
<
nHot
)
{
T
*
i
=
input_features
+
r
[
2
*
threadIdx
.
y
]
*
nPlanes
;
T
*
o
=
output_features
+
r
[
2
*
threadIdx
.
y
+
1
];
for
(
Int
plane
=
threadIdx
.
x
;
plane
<
nPlanes
;
plane
+=
NTX
)
o
[
plane
*
spatialVolume
]
=
i
[
plane
];
}
if
(
input_features
.
ndimension
()
==
2
)
{
auto
_rules
=
m
.
getSparseToDenseRuleBook
(
inputSize
,
true
);
Int
_nPlanes
=
input_features
.
size
(
1
);
auto
iF
=
input_features
.
data
<
T
>
();
auto
oF
=
output_features
.
data
<
T
>
();
long
spatialVolume
=
inputSize
.
prod
().
data
<
long
>
()[
0
];
RULEBOOKITERATOR
(
SparseToDense_ForwardPass
<
T
>
(
iF
,
oF
,
_nPlanes
,
spatialVolume
,
rbB
,
nHotB
);
,
oF
+=
_nPlanes
*
spatialVolume
;)
__syncthreads
();
}
}
template
<
typename
T
,
Int
Dimension
>
void
cuda_SparseToDense_updateGradInput
(
/*long*/
at
::
Tensor
inputSize
,
Metadata
<
Dimension
>
&
m
,
/*cuda float*/
at
::
Tensor
input_features
,
/*cuda float*/
at
::
Tensor
d_input_features
,
/*cuda float*/
at
::
Tensor
d_output_features
)
{
d_input_features
.
resize_as_
(
input_features
);
d_input_features
.
zero_
();
template
<
typename
T
>
void
cuda_SparseToDense_ForwardPass
(
T
*
input_features
,
T
*
output_features
,
Int
nPlanes
,
Int
spatialVolume
,
RuleBook
_rules
)
{
RULEBOOKITERATOR
((
SparseToDense_fp
<
T
,
32
,
32
><<<
32
,
dim3
(
32
,
32
)
>>>
(
input_features
,
output_features
,
nPlanes
,
spatialVolume
,
rbB
,
nHotB
));
,
output_features
+=
nPlanes
*
spatialVolume
;)
}
if
(
input_features
.
ndimension
()
==
2
)
{
auto
_rules
=
m
.
getSparseToDenseRuleBook
(
inputSize
,
true
);
long
spatialVolume
=
inputSize
.
prod
().
data
<
long
>
()[
0
];
Int
_nPlanes
=
d_input_features
.
size
(
1
);
auto
diF
=
d_input_features
.
data
<
T
>
();
auto
doF
=
d_output_features
.
data
<
T
>
();
RULEBOOKITERATOR
(
SparseToDense_BackwardPass
<
T
>
(
diF
,
doF
,
_nPlanes
,
spatialVolume
,
rbB
,
nHotB
);
,
doF
+=
_nPlanes
*
spatialVolume
;)
// Sparse-to-dense backward kernel. Each rule pairs (sparse row, dense
// offset): the sparse side is row-major (row * nPlanes) while in the dense
// side consecutive planes are spatialVolume elements apart, so plane p is
// read from d_o[p * spatialVolume].
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int spatialVolume, Int *rules,
                                 Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
      T *d_o = d_output_features + r[2 * threadIdx.y + 1];
      // Straight copy: dense (strided) gradient -> sparse (contiguous) row.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_i[plane] = d_o[plane * spatialVolume];
    }
    __syncthreads();
  }
}
// Host-side launcher for the sparse-to-dense backward kernel.
// RULEBOOKITERATOR walks the rule book (`rbB`/`nHotB` are presumably the
// per-batch rule pointer and count — confirm in RuleBookIterator.h). The
// epilogue advances the dense-gradient pointer by one sample's worth of
// data (nPlanes * spatialVolume) after each batch, suggesting one rule
// batch per sample — confirm against the rule-book construction.
template <typename T>
void cuda_SparseToDense_BackwardPass(T *d_input_features,
                                     T *d_output_features, Int nPlanes,
                                     Int spatialVolume, RuleBook _rules) {
  RULEBOOKITERATOR((SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
                       d_input_features, d_output_features, nPlanes,
                       spatialVolume, rbB, nHotB));
                   , d_output_features += nPlanes * spatialVolume;)
}
sparseconvnet/SCN/CUDA/SparseToDense.h
deleted
100644 → 0
View file @
f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_SPARSETODENSE_H
#define CUDA_SPARSETODENSE_H
// (Deleted header version.) Sparse-to-dense forward kernel: each rule pairs
// (sparse row, dense offset). Sparse features are row-major (row * nPlanes);
// in the dense tensor, plane p lands at o[p * spatialVolume].
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
                                 Int nPlanes, Int spatialVolume, Int *rules,
                                 Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = input_features + r[2 * threadIdx.y] * nPlanes;
      T *o = output_features + r[2 * threadIdx.y + 1];
      // Straight copy: sparse (contiguous) row -> dense (strided) cells.
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
    }
    __syncthreads();
  }
}
// (Deleted header version.) Launch the sparse-to-dense forward kernel over
// the whole rule list. Fixed launch shape: 32 blocks of 32x32 threads.
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
                               Int nPlanes, Int spatialVolume, Int *rules,
                               Int nHot) {
  SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// (Deleted header version.) Sparse-to-dense backward kernel: copies each
// dense gradient cell (plane p at offset p * spatialVolume) back into the
// corresponding contiguous sparse row.
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int spatialVolume, Int *rules,
                                 Int nHot) {
  // Shared staging area for the next NTY rule pairs.
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      // First NTY*2 threads copy the remaining rule pairs to shared memory.
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
      T *d_o = d_output_features + r[2 * threadIdx.y + 1];
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_i[plane] = d_o[plane * spatialVolume];
    }
    __syncthreads();
  }
}
// (Deleted header version.) Launch the sparse-to-dense backward kernel over
// the whole rule list. Fixed launch shape: 32 blocks of 32x32 threads.
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
                                Int nPlanes, Int spatialVolume, Int *rules,
                                Int nHot) {
  SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
      nHot);
}
#endif
/* CUDA_SPARSETODENSE_H */
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment