"tools/vscode:/vscode.git/clone" did not exist on "6bd8be71a2d7346fce048df9878eb10e37a3869b"
Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h"
#define BN_F_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_ForwardPass<T, N, 64>( \
input_features.data<T>(), output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, momentum, \
train, leakiness); \
}
template <typename T>
void cuda_BatchNormalization_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor saveMean,
/*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
T momentum, bool train, T leakiness) {
output_features.resize_as_(input_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
template <typename T>
void cuda_BatchNormalizationInTensor_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor saveMean,
/*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
T momentum, bool train, T leakiness) {
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
#undef BN_F_MACRO
#define BN_B_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_BackwardPass<T, N, 64>( \
input_features.data<T>(), d_input_features.data<T>(), \
output_features.data<T>(), d_output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), \
OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias), leakiness); \
}
template <typename T>
void cuda_BatchNormalization_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
/*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias,
/*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias,
T leakiness) {
d_input_features.resize_as_(d_output_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BN_B_MACRO(16)
else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
}
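// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the commit): the BN_F_MACRO / BN_B_MACRO
// chains above pick the largest tile width NTX that divides nPlanes, so the
// kernels never have to mask partial columns. The same dispatch pattern in
// isolation, with hypothetical names:
template <int NTX> void bn_launch_example(int /*nPlanes*/) {
  // would launch the <T, NTX, 64> kernel here
}
inline void bn_dispatch_example(int nPlanes) {
  if (nPlanes % 16 == 0)
    bn_launch_example<16>(nPlanes);
  else if (nPlanes % 12 == 0)
    bn_launch_example<12>(nPlanes);
  else if (nPlanes % 8 == 0)
    bn_launch_example<8>(nPlanes);
  else if (nPlanes % 4 == 0)
    bn_launch_example<4>(nPlanes);
  else
    bn_launch_example<1>(nPlanes);
}
// ---------------------------------------------------------------------------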
@@ -4,9 +4,9 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHNORMALIZATION_H
#define CUDA_BATCHNORMALIZATION_H
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
@@ -14,22 +14,22 @@
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_train(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
row < nActive; row += NTY, c += input_stride * NTY) {
T i = input_features[c];
t[threadIdx.y][threadIdx.x] += i;
@@ -38,7 +38,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
__syncthreads();
T _saveMean = 0;
T _saveInvStd = 0;
for (Int row = 0; row < NTY; row++) {
_saveMean += t[row][threadIdx.x];
_saveInvStd += t2[row][threadIdx.x];
}
@@ -65,7 +65,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
T W = t[0][threadIdx.x];
T B = t[1][threadIdx.x];
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -75,16 +75,16 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_test(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T W[NTX];
__shared__ T B[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
W[threadIdx.x] =
@@ -95,7 +95,7 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
__syncthreads();
float w = W[threadIdx.x], b = B[threadIdx.x];
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -106,40 +106,38 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
BatchNormalization_f_train<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
} else {
BatchNormalization_f_test<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean,
T *saveInvStd, T *runningMean, T *runningVar, T *weight,
T *bias, T *d_weight, T *d_bias, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
t[0][threadIdx.x] = saveMean[plane];
@@ -153,7 +151,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
__syncthreads();
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -180,7 +178,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T k = dotp * _saveInvStd * _saveInvStd / nActive;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -192,17 +190,16 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
BatchNormalization_b<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
@@ -210,4 +207,4 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
#undef NTX
#undef NTY
#endif /* CUDA_BATCHNORMALIZATION_H */
@@ -4,32 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchwiseMultiplicativeDropout.h"
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_fp< \
T, NTX, \
NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
input_features.data<T>(), output_features.data<T>(), \
noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
float alpha) {
if (input_features != output_features)
THCTensor_(resizeAs)(state, output_features, input_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
output_features.resize_as_(input_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
SPARSECONVNET_FOO(32, 32)
SPARSECONVNET_FOO(24, 32)
SPARSECONVNET_FOO(16, 64)
@@ -43,24 +39,24 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_bp< \
T, NTX, \
NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
input_features.data<T>(), d_input_features.data<T>(), \
d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes, \
nPlanes, nPlanes, alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *noise, float alpha) {
if (d_input_features != d_output_features)
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor noise, float alpha) {
d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
SPARSECONVNET_FOO(32, 32)
SPARSECONVNET_FOO(24, 32)
@@ -71,5 +67,3 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
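// ---------------------------------------------------------------------------
// Editor's note (CPU reference sketch, not part of the commit): what the
// BatchwiseMultiplicativeDropout_fp kernel computes. Each feature plane p is
// scaled by one shared noise value, and negative inputs are additionally
// scaled by alpha (leaky-ReLU-style gating). Contiguous layout assumed
// (input_stride == output_stride == nPlanes), as in the calls above.
template <typename T>
void dropout_fp_reference(const T *in, T *out, const T *noise, int nActive,
                          int nPlanes, T alpha) {
  for (int row = 0; row < nActive; ++row)
    for (int p = 0; p < nPlanes; ++p) {
      T x = in[row * nPlanes + p];
      out[row * nPlanes + p] = x * noise[p] * (x > 0 ? T(1) : alpha);
    }
}
// ---------------------------------------------------------------------------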
@@ -4,50 +4,50 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
T *output_features, T *noise,
Int nActive, Int nPlanes,
Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
output_features[o] = input_features[i] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
T *d_output_features, T *noise, Int nActive,
Int nPlanes, Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
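// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the commit): how the returned flop count
// is built up. RULEBOOKITERATOR runs the forward GEMM once per filter offset;
// each offset contributes nHotB active input/output pairs at a cost of
// ip * op multiply-accumulates per pair, while w advances by ip * op so each
// offset uses its own weight slice. Equivalent accounting, assuming an array
// of per-offset pair counts:
inline double convolution_flops_example(const Int *nHotPerOffset, Int nOffsets,
                                        Int ip, Int op) {
  double flops = 0;
  for (Int k = 0; k < nOffsets; ++k)
    flops += static_cast<double>(nHotPerOffset[k]) * ip * op;
  return flops;
}
// ---------------------------------------------------------------------------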
template <typename T, Int Dimension>
void cuda_Convolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
// if (bias.numel()) {
// auto b = bias.data<T>();
// for (Int i = 0; i < op; i += 32) {
// Int blockDim = min((Int)32, op - i);
// Int gridDim = min((Int)4096, nActive);
// Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
// nActive);
// }
// }
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
@@ -4,47 +4,47 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_CONVOLUTION_H
#define CUDA_CONVOLUTION_H
template <typename T>
__global__ void Convolution_fp_bias(T *output_features, T *bias, Int nPlanes,
Int output_stride, Int nActive) {
__shared__ T b[32];
b[threadIdx.x] = bias[threadIdx.x];
for (Int row = blockIdx.x; row < nActive; row += 1 << 12) {
output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
}
}
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS) {
Int i = blockIdx.x * 32 + threadIdx.x;
T t = 0;
for (Int j = blockIdx.y; j < nRows; j += 32)
t += matrix[j * nCOLUMNS + i];
atomicAdd(&target[i], t);
}
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS) {
if (nColumns / 32 > 0)
dColumnSum<<<dim3(nColumns / 32, 32), 32>>>(
matrix, target, nRows, nColumns, nCOLUMNS);
if (nColumns % 32 > 0) {
Int o = nColumns / 32 * 32;
dColumnSum<<<dim3(1, 32), nColumns - o>>>(
matrix + o, target + o, nRows, nColumns, nCOLUMNS);
}
}
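// ---------------------------------------------------------------------------
// Editor's note (CPU reference sketch, not part of the commit): the bias
// gradient is a column-wise sum of the output-gradient matrix. dColumnSum
// above parallelises this over 32-column blocks plus a remainder launch; the
// sequential equivalent is:
template <typename T>
void bp_bias_reference(const T *matrix, T *target, Int nRows, Int nColumns,
                       Int rowStride /* = nCOLUMNS above */) {
  for (Int j = 0; j < nColumns; ++j)
    for (Int i = 0; i < nRows; ++i)
      target[j] += matrix[i * rowStride + j];
}
// ---------------------------------------------------------------------------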
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
@@ -53,17 +53,17 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -76,7 +76,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
@@ -110,28 +110,28 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -144,7 +144,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -187,27 +187,27 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
@@ -216,10 +216,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
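// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the commit): the shape of the FOO launch
// above. Kernel A assumes its row count is a multiple of K and skips all
// bounds checks; kernel B mops up the nHot % K remainder with per-row guards,
// so the fast path stays branch-free.
inline void ab_split_example(Int nHot, Int K) {
  Int o = (nHot / K) * K; // rows covered by the unguarded kernel A
  if (o >= K) {
    // launch ..._forwardA on rows [0, o)
  }
  if (nHot > o) {
    // launch ..._forwardB on rows [o, nHot)
  }
}
// ---------------------------------------------------------------------------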
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
@@ -230,15 +229,15 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -249,8 +248,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -265,7 +264,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
@@ -307,15 +306,15 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -326,8 +325,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -342,7 +341,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -392,29 +391,29 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
@@ -422,11 +421,11 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
#undef FOO
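// ---------------------------------------------------------------------------
// Editor's note (CPU reference sketch, not part of the commit): the two
// products the backward kernels fuse, for one filter offset. rules holds
// (input row, output row) pairs; per pair, dInput += dOutput * W^T and
// dW += Input^T * dOutput. ip/op double as strides here (contiguous case).
template <typename T>
void conv_backward_reference(const T *in, T *dIn, const T *dOut, const T *w,
                             T *dw, const Int *rules, Int nHot, Int ip,
                             Int op) {
  for (Int s = 0; s < nHot; ++s) {
    const T *x = in + rules[2 * s] * ip;      // active input row
    T *dx = dIn + rules[2 * s] * ip;
    const T *dy = dOut + rules[2 * s + 1] * op; // matching output row
    for (Int i = 0; i < ip; ++i)
      for (Int j = 0; j < op; ++j) {
        dx[i] += dy[j] * w[i * op + j];   // dInput = dOutput * W^T
        dw[i * op + j] += x[i] * dy[j];   // dW = Input^T * dOutput
      }
  }
}
// ---------------------------------------------------------------------------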
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
@@ -434,17 +433,17 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -452,7 +451,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
@@ -460,7 +459,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -500,48 +499,47 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
template <typename T>
void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dConvolution_KMxKN_forward2<
T, K, V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
return;
} else {
dConvolution_forward(inFeatures, outFeatures, w, rules, nHot, input_nPlanes,
input_stride, output_nPlanes, output_stride);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -549,7 +547,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
@@ -559,7 +557,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -613,23 +611,22 @@
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dConvolution_KMxKN_backward_dW2<T, K, V><<<
dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
return;
} else {
dConvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
}
}
#endif /* CUDA_CONVOLUTION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "Deconvolution.h"
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
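// Editor's note: unlike cuda_Convolution_updateOutput, the rulebook here is
// built with outputSize and inputSize swapped; a deconvolution applies the
// transposed connectivity of the corresponding convolution.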
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
double flops = 0;
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dDeconvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
return flops;
}
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
@@ -4,16 +4,16 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_DECONVOLUTION_H
#define CUDA_DECONVOLUTION_H
#include "Convolution.h"
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
@@ -22,17 +22,17 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -45,7 +45,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
@@ -79,28 +79,28 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -113,7 +113,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -156,27 +156,27 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
@@ -185,10 +185,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
@@ -199,14 +198,14 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -217,8 +216,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -233,7 +232,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
@@ -275,14 +274,14 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -293,8 +292,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -309,7 +308,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -359,29 +358,29 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
@@ -389,11 +388,11 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
#undef FOO
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// ...
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
// ...
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
// ...
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
// ...
}
template <typename T>
void dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dDeconvolution_KMxKN_forward2<T, K, V><<<
dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
return;
} else {
dDeconvolution_forward(inFeatures, outFeatures, w, rules, nHot,
input_nPlanes, input_stride, output_nPlanes,
output_stride);
}
}
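// Editor's note: this wrapper chooses between two paths. When either plane
// count is not a multiple of 8, the padded forward2 kernel above (K=16, V=4)
// handles ragged tile edges via its min(K, ...) clamps; otherwise the tiled
// dDeconvolution_forward fast path applies.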
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
// ...
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
// ...
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
// ...
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dDeconvolution_KMxKN_backward_dW2<T, K, V><<<
dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
return;
} else {
dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
}
}
#endif /* CUDA_DECONVOLUTION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long batchSize, long mode) {
m.inputLayer(spatialSize, input_coords, batchSize, mode);
Int nPlanes = input_features.size(1);
auto &rules = m.inputLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
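// Editor's sketch of the assumed rulebook layout, inferred from the
// InputLayer_fp kernel in IOLayers.h: rules[1] flattens nRows records of
// 1 + maxActive ints each,
//   [nActive, inRow_0, ..., inRow_{nActive-1}, <unused padding>]
// and output row r is the sum (or, for mode == 4, the mean) of the listed
// input rows.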
template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({rules[0][2], nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = input_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({rules[0][2], nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
}
}
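// Editor's note: the output layer reuses the input layer's kernels with the
// two feature buffers swapped: InputLayer_bp scatters in this forward pass,
// and InputLayer_fp gathers in the backward pass below, so no dedicated
// OutputLayer kernels are needed.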
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
}
}
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long mode) {
m.blLayer(spatialSize, input_coords, mode);
Int nPlanes = input_features.size(2);
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto &rules = m.blLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({*m.inputNActive, nPlanes});
} else {
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = d_output_features.size(1);
Int mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = input_features.size(1);
auto mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
}
}
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = d_output_features.size(2);
Int mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({nRows, nPlanes});
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_IOLAYERS_H
#define CUDA_IOLAYERS_H
template <typename T>
__global__ void InputLayer_fp(T *input_features, T *output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
out[plane] += multiplier * inp[plane];
}
}
}
template <typename T>
__global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = d_output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = d_input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
atomicAdd(&inp[plane], multiplier * out[plane]);
}
}
}
#endif /* CUDA_IOLAYERS_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "LeakyReLU.h"
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
float alpha) {
output_features.resize_as_(input_features);
auto n = input_features.numel();
LeakyReLU_fp<T><<<16, 1024>>>(input_features.data<T>(),
output_features.data<T>(), n, alpha);
}
template <typename T>
void cuda_LeakyReLU_updateGradInput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, float alpha) {
d_input_features.resize_as_(d_output_features);
auto n = d_input_features.numel();
LeakyReLU_bp<T><<<16, 1024>>>(input_features.data<T>(),
d_input_features.data<T>(),
d_output_features.data<T>(), n, alpha);
}
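// Editor's sketch (not part of the source): the kernels below compute
//   f(x)  = x > 0 ? x : alpha * x
//   f'(x) = x > 0 ? 1 : alpha
// with a grid-stride loop whose hard-wired stride, 16 * 1024, matches the
// fixed <<<16, 1024>>> launch used above (16 blocks of 1024 threads).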
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
T alpha) {
for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
output_features[i] = (input_features[i] > 0) ? input_features[i]
: (input_features[i] * alpha);
}
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
T *d_output_features, Int n, T alpha) {
for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
d_input_features[i] = (input_features[i] > 0)
? d_output_features[i]
: (d_output_features[i] * alpha);
}
#endif /* LEAKYRELU_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
RULEBOOKITERATOR(
cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB);
, )
}
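// Editor's note: nFeaturesToDrop shifts the input pointer past the first
// planes while the kernel keeps indexing rows with the full stride
// input_features.size(1), so only the remaining nPlanes features are pooled.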
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
iF, diF, oF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB);
, )
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
RULEBOOKITERATOR(
cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB);
, )
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
iF, diF, oF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB);
, )
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_MAXPOOLING_H
#define CUDA_MAXPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
T inp = input_features[i + plane];
if (output_features[o + plane] < inp)
output_features[o + plane] = inp;
// ...
}
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
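// Editor's sketch of the launch geometry above (an assumption matching the
// <32, 32> template arguments): blockDim = (NTX, NTY) = (32, 32), so each of
// the 32 blocks stages NTY (input, output) rule pairs in shared memory per
// iteration while threadIdx.x sweeps the planes of each pair in steps of NTX.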
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
// ...
}
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, rules, nHot);
}
#endif /* CUDA_MAXPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include <algorithm>
template <typename T>
double cuda_NetworkInNetwork_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
auto nActive = input_features.size(0);
auto input_nPlanes = weight.size(0);
auto output_nPlanes = weight.size(1);
  output_features.resize_({nActive, output_nPlanes});
if (bias.numel())
output_features.copy_(bias);
else
output_features.zero_();
  output_features.addmm_(input_features, weight);
return nActive * input_nPlanes * output_nPlanes;
}
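// Editor's sketch (shapes only): with weight shaped
// [input_nPlanes, output_nPlanes] this is a 1x1 network-in-network
// convolution over the active sites,
//   output[nActive, output_nPlanes] = input[nActive, input_nPlanes] @ weight
// with bias, when present, pre-copied into every output row; the returned
// double is presumably the multiply-add count for FLOP accounting.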
template <typename T>
void cuda_NetworkInNetwork_updateGradInput(
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight) {
d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
d_input_features.zero_();
at::mm_out(d_input_features, d_output_features, weight.t());
}
template <typename T>
void cuda_NetworkInNetwork_accGradParameters(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias) {
auto nActive = input_features.size(0);
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
at::mm_out(d_weight, input_features.t(), d_output_features);
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_RULEBOOKITERATOR_H
#define CUDA_RULEBOOKITERATOR_H
// Macro to parallelize loading rulebook elements to CUDA memory and operating
// on the elements of the rulebook.
// X is the function to apply.
// Y is a command to run
#define RULEBOOKITERATOR(X, Y) \
  {                                                                          \
    Int rbMaxSize = 0;                                                       \
    for (auto &r : _rules)                                                   \
      rbMaxSize = std::max(rbMaxSize, (Int)r.size());                        \
    at::Tensor rulesBuffer = at::CUDA(at_kINT).tensor({rbMaxSize});          \
    Int *rbB = rulesBuffer.data<Int>();                                      \
    for (int k = 0; k < _rules.size(); ++k) {                                \
      auto &r = _rules[k];                                                   \
      Int nHotB = r.size() / 2;                                              \
      if (nHotB) {                                                           \
        cudaMemcpy(rbB, &r[0], sizeof(Int) * 2 * nHotB,                      \
                   cudaMemcpyHostToDevice);                                  \
        X                                                                    \
      }                                                                      \
      Y                                                                      \
    }                                                                        \
  }
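// Editor's sketch of how the macro splices its arguments (myKernel and
// rowStride are hypothetical names):
//
//   RULEBOOKITERATOR(myKernel<T>(oF, rbB, nHotB);, oF += rowStride;)
//
// For every non-empty rulebook entry the rule pairs are copied into the
// device buffer rbB and X runs with rbB/nHotB in scope; Y runs after every
// entry, empty or not.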
#endif /* CUDA_RULEBOOKITERATOR_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"
template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nPlanes) {
{
std::array<long, Dimension + 2> sz;
sz[0] = m.grids.begin()->second.size(); // batch size
sz[1] = nPlanes;
long *in_sz = inputSize.data<long>();
for (Int i = 0; i < Dimension; ++i)
sz[i + 2] = in_sz[i];
output_features.resize_(sz);
output_features.zero_();
}
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
Int _nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
long spatialVolume = inputSize.prod().data<long>()[0];
RULEBOOKITERATOR(SparseToDense_ForwardPass<T>( iF, oF, _nPlanes,
spatialVolume, rbB, nHotB);
, oF += _nPlanes * spatialVolume;)
}
}
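// Editor's note: the dense output is laid out [batch, nPlanes, spatial...],
// so the planes of one site sit spatialVolume apart; the Y clause advances
// oF by nPlanes * spatialVolume, apparently stepping to the next sample's
// dense block, one rulebook entry per batch item.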
template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
long spatialVolume = inputSize.prod().data<long>()[0];
Int _nPlanes = d_input_features.size(1);
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(SparseToDense_BackwardPass<T>( diF, doF, _nPlanes,
spatialVolume, rbB, nHotB);
, doF += _nPlanes * spatialVolume;)
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_SPARSETODENSE_H
#define CUDA_SPARSETODENSE_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *i = input_features + r[2 * threadIdx.y] * nPlanes;
T *o = output_features + r[2 * threadIdx.y + 1];
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
o[plane * spatialVolume] = i[plane];
}
__syncthreads();
// ...
}
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
T *d_o = d_output_features + r[2 * threadIdx.y + 1];
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_i[plane] = d_o[plane * spatialVolume];
}
__syncthreads();
// ...
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, spatialVolume, rules, nHot);
}
#endif /* CUDA_SPARSETODENSE_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
#include "UnPooling.h"
template <typename T, Int Dimension>
void cuda_UnPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
RULEBOOKITERATOR(
cuda_UnPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB);
, )
}
template <typename T, Int Dimension>
void cuda_UnPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(
cuda_UnPooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB);
, )
}
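// Editor's note: unpooling replays the pooling rulebook in reverse; the
// rulebook is built with outputSize and inputSize swapped relative to
// MaxPooling above, and UnPooling_fp reads r[2*ty+1] as the input row and
// r[2*ty] as the output row, scattering each pooled row back out.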
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_UNPOOLING_H
#define CUDA_UNPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y + 1] * input_stride;
Int o = r[2 * threadIdx.y] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
output_features[o + plane] += input_features[i + plane];
}
__syncthreads();
}
}
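// Editor's sketch of the shared staging above (hence the "NTX must be >=2"
// note): the first 2 * NTY linear thread ids copy this tile's rule pairs
// into r[], guarded by i < 2 * (nHot - n) so the ragged final tile never
// reads past the end of the rulebook.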
template <typename T>
void cuda_UnPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
UnPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(input_features, output_features,
nPlanes, input_stride,
output_stride, rules, nHot);
}
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y + 1] * input_stride;
Int o = r[2 * threadIdx.y] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += d_output_features[o + plane];
}
__syncthreads();
}
}
template <typename T>
void cuda_UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
UnPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
#endif /* CUDA_UNPOOLING_H */