"googlemock/include/vscode:/vscode.git/clone" did not exist on "0599a7b8410dc5cfdb477900b280475ae775d7f9"
Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
@@ -6,7 +6,7 @@
#ifndef CPU_BATCHNORMALIZATION_H
#define CPU_BATCHNORMALIZATION_H
#include "../SparseConvNet.h"
#include <vector>
// in/output_stride is normally the same as nPlanes; allow other values to act
@@ -14,28 +14,28 @@
template <typename T>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt nActive,
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
std::memset(saveMean, 0, nPlanes * sizeof(T));
std::memset(saveInvStd, 0, nPlanes * sizeof(T));
- for (uInt row = 0, ci = 0; row < nActive;
+ for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveMean[plane] += input_features[ci];
}
}
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] /= nActive;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
}
- for (uInt row = 0, ci = 0; row < nActive;
+ for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveInvStd[plane] +=
(input_features[ci] - saveMean[plane]) *
(input_features[ci] - saveMean[plane]); // accumulate sum-squares
@@ -43,26 +43,26 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
// rooting
}
}
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * saveInvStd[plane] / (nActive - 1);
saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
}
} else {
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] = runningMean[plane];
saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
}
}
std::vector<T> w(nPlanes);
std::vector<T> b(nPlanes);
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
}
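// w and b fold the whole batch norm into a single affine map per plane:
// with w = invStd * weight and b = -mean * w + bias,
// input * w + b == (input - mean) * invStd * weight + bias, so the loop
// below applies normalize, scale, shift and leaky ReLU in one pass.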
- for (uInt row = 0, ci = 0, co = 0; row < nActive;
+ for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T out = input_features[ci] * w[plane] + b[plane];
out = (out > 0) ? out : (out * leakiness);
output_features[co] = out;
@@ -73,17 +73,17 @@ void BatchNormalization_ForwardPass(T *input_features, T *output_features,
template <typename T>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt nActive,
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
std::vector<T> gradMean(nPlanes);
std::vector<T> dotp(nPlanes);
std::vector<T> k(nPlanes);
- for (uInt row = 0, ci = 0, co = 0; row < nActive;
+ for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
@@ -91,15 +91,15 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
}
}
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
if (d_bias)
d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
gradMean[plane] /= nActive; // ...now
k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
}
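// This is the standard batch norm backward identity:
// dx = (dy - mean(dy) - xhat * dot(dy, xhat) / n) * invStd * weight,
// with k = dotp * invStd^2 / nActive supplying the projection term; the
// trailing invStd * weight factor sits in the lines elided from this hunk.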
- for (uInt row = 0, ci = 0, co = 0; row < nActive;
+ for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
- for (uInt plane = 0; plane < nPlanes; plane++, ci++, co++) {
+ for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
d_input_features[ci] =
(d_output_features[co] - gradMean[plane] -
(input_features[ci] - saveMean[plane]) * k[plane]) *
@@ -107,7 +107,7 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
}
}
if (d_weight)
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
d_weight[plane] = dotp[plane] * saveInvStd[plane];
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
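// Batchwise multiplicative dropout: noise holds one multiplier per feature
// plane, shared by every active site in the batch; negative inputs are
// additionally scaled by alpha, giving a leaky-ReLU-shaped response.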
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor noise, float alpha) {
output_features.resize_as_(input_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto nz = noise.data<T>();
for (Int row = 0; row < nActive; row++)
for (Int plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
plane++, o++, i++)
oF[o] = (iF[i] > 0) ? iF[i] * nz[plane] : iF[i] * nz[plane] * alpha;
}
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise,
float alpha) {
d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto nz = noise.data<T>();
for (Int row = 0; row < nActive; row++)
for (Int plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
plane++, o++, i++)
diF[i] = (iF[i] > 0) ? doF[o] * nz[plane] : doF[o] * nz[plane] * alpha;
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include <cstring>
template <typename T>
void rule_index_select(at::Tensor target, at::Tensor src, Int nRules,
Int *rules) {
auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>();
auto n = target.size(1);
for (int i = 0; i < nRules; ++i)
std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n);
}
template <typename T>
void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules,
Int *rules) {
auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>();
auto n = target.size(1);
for (int i = 0; i < nRules; ++i) {
auto t = t_ptr + rules[2 * i] * n;
auto s = s_ptr + i * n;
for (int j = 0; j < n; ++j)
t[j] += s[j];
}
}
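// Rulebook convention used by all the convolutions below: each rule vector
// stores flat (gather row, scatter row) pairs, so rules[2 * i] indexes the
// source row and rules[2 * i + 1] the destination row. rule_index_select
// gathers rows into a dense matrix and rule_index_add_ scatter-adds them
// back, replacing the commented-out index_select / index_add_ tensor calls.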
template <typename T, Int Dimension>
double cpu_Convolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
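// One rulebook entry per filter offset: gather the matching input rows,
// multiply by that offset's slice of the weight tensor, scatter-add into
// the output. flops counts one multiply-accumulate per gathered element.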
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_Convolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
template <typename T, Int Dimension>
double cpu_SubmanifoldConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_SubmanifoldConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight,
/*float*/ at::Tensor d_bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
template <typename T, Int Dimension>
double cpu_FullConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_FullConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight,
/*float*/ at::Tensor d_bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
template <typename T, Int Dimension>
double cpu_RandomizedStrideConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 1), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[1]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_RandomizedStrideConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 0));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 1));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 0), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[0]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[1]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[0]);
}
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
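// Deconvolution reuses the convolution rulebook with input and output roles
// swapped: getRuleBook is called as (outputSize, inputSize, ...) and the
// gather/scatter halves of each rule pair (&r[1], &r[0]) are exchanged
// relative to cpu_Convolution_updateOutput above.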
template <typename T, Int Dimension>
double cpu_Deconvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 1));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 0), output_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[0]);
}
}
return flops;
}
template <typename T, Int Dimension>
void cpu_Deconvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto r = _rules[i];
int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 1));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 0));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 1), d_input_rows);
auto input_rows = input_features.type().tensor({nRules, ip});
rule_index_select<T>(input_rows, input_features, nRules, &r[1]);
auto d_output_rows = d_output_features.type().tensor({nRules, op});
rule_index_select<T>(d_output_rows, d_output_features, nRules, &r[0]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1]);
}
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
long batchSize, long mode) {
m.inputLayer(spatialSize, input_coords, batchSize, mode);
auto nPlanes = input_features.size(1);
auto &rules = m.inputLayerRuleBook;
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
auto nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({rules[0][2], nPlanes});
d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features) {
auto &rules = m.inputLayerRuleBook;
auto nPlanes = input_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({rules[0][2], nPlanes});
output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
auto nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
long mode) {
m.blLayer(spatialSize, input_coords, mode);
auto nPlanes = input_features.size(2);
auto &rules = m.blLayerRuleBook;
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({*m.inputNActive, nPlanes});
} else {
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
auto nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features) {
auto &rules = m.blLayerRuleBook;
auto nPlanes = input_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
auto nPlanes = d_output_features.size(2);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({nRows, nPlanes});
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
}
}
@@ -6,21 +6,21 @@
#ifndef CPU_IOLAYERS_H
#define CPU_IOLAYERS_H
#include "../SparseConvNet.h"
#include <cstring>
// Assume output and d_input_features have been zero-ed
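// Rule layout assumed here: each of the nRows rules occupies maxActive + 1
// ints, a count of active source rows followed by their row indices (the
// same layout the CUDA ActivePooling kernels index as
// rules[row * (maxActive + 1)]).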
template <typename T>
- void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
- uInt maxActive, uInt nPlanes, uInt *rules,
+ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
+ Int maxActive, Int nPlanes, Int *rules,
bool average) {
- for (uInt row = 0; row < nRows; row++) {
+ for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
- for (uInt i = 1; i <= nActive; ++i) {
+ for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
- for (uInt plane = 0; plane < nPlanes; plane++) {
+ for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
}
}
@@ -30,14 +30,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
}
template <typename T>
void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
- uInt nRows, uInt maxActive, uInt nPlanes,
- uInt *rules, bool average) {
- for (uInt row = 0; row < nRows; row++) {
+ Int nRows, Int maxActive, Int nPlanes,
+ Int *rules, bool average) {
+ for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
- for (uInt i = 1; i <= nActive; ++i) {
+ for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
- for (uInt plane = 0; plane < nPlanes; plane++)
+ for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
}
d_output_features += nPlanes;
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
float alpha) {
output_features.resize_as_(input_features);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto n = input_features.numel();
for (Int i = 0; i < n; i++)
oF[i] = (iF[i] > 0) ? iF[i] : iF[i] * alpha;
}
template <typename T>
void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features,
float alpha) {
d_input_features.resize_as_(d_output_features);
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
auto n = d_input_features.numel();
for (Int i = 0; i < n; i++)
diF[i] = (iF[i] > 0) ? doF[i] : doF[i] * alpha;
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_MaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
MaxPooling_BackwardPass<T>(iF, diF, oF, doF, nPlanes,
input_features.stride(0),
output_features.stride(0), &r[0], nHot);
}
}
@@ -6,16 +6,16 @@
#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
#include "../SparseConvNet.h"
template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt *rules, uInt nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite] * input_stride;
- uInt o = rules[2 * outSite + 1] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int *rules, Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite] * input_stride;
+ Int o = rules[2 * outSite + 1] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] < input_features[i + plane])
output_features[o + plane] = input_features[i + plane];
}
@@ -23,12 +23,12 @@ void MaxPooling_ForwardPass(T *input_features, T *output_features,
template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
- uInt nPlanes, uInt input_stride,
- uInt output_stride, uInt *rules, uInt nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite] * input_stride;
- uInt o = rules[2 * outSite + 1] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ Int nPlanes, Int input_stride,
+ Int output_stride, Int *rules, Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite] * input_stride;
+ Int o = rules[2 * outSite + 1] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
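// NetworkInNetwork is a 1x1 convolution applied sitewise: a single dense
// matrix multiply over all active rows, so no rulebook is required.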
template <typename T>
double cpu_NetworkInNetwork_updateOutput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
/*float*/ at::Tensor weight,
/*float*/ at::Tensor bias) {
auto nActive = input_features.size(0);
auto input_nPlanes = weight.size(0);
auto output_nPlanes = weight.size(1);
output_features.resize_({nActive, output_nPlanes});
if (bias.numel())
output_features.copy_(bias);
else
output_features.zero_();
output_features.addmm_(input_features, weight); // in-place: bias was pre-loaded above
return nActive * input_nPlanes * output_nPlanes;
}
template <typename T>
void cpu_NetworkInNetwork_updateGradInput(
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features,
/*float*/ at::Tensor weight) {
d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
d_input_features.zero_();
at::mm_out(d_input_features, d_output_features, weight.t());
}
template <typename T>
void cpu_NetworkInNetwork_accGradParameters(
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_output_features,
/*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
auto nActive = input_features.size(0);
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
at::mm_out(d_weight, input_features.t(), d_output_features);
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"
template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nPlanes) {
{
std::array<long, Dimension + 2> sz;
sz[0] = m.grids.begin()->second.size(); // batch size
sz[1] = nPlanes;
long *in_sz = inputSize.data<long>();
for (Int i = 0; i < Dimension; ++i)
sz[i + 2] = in_sz[i];
output_features.resize_(sz);
output_features.zero_();
}
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
Int _nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
long spatialVolume = inputSize.prod().data<long>()[0];
for (auto &r : _rules) {
Int nHot = r.size() / 2;
SparseToDense_ForwardPass<T>(iF, oF, _nPlanes, spatialVolume, &r[0],
nHot);
oF += _nPlanes * spatialVolume;
}
}
}
template <typename T, Int Dimension>
void cpu_SparseToDense_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features) {
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
long spatialVolume = inputSize.prod().data<long>()[0];
Int _nPlanes = d_input_features.size(1);
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
SparseToDense_BackwardPass<T>(diF, doF, _nPlanes, spatialVolume, &r[0],
nHot);
doF += _nPlanes * spatialVolume;
}
}
}
@@ -6,29 +6,29 @@
#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
- uInt nPlanes, uInt spatialVolume, uInt *rules,
+ Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
T *i = input_features + rules[2 * outSite] * nPlanes;
T *o = output_features + rules[2 * outSite + 1];
- for (uInt plane = 0; plane < nPlanes; plane++)
+ for (Int plane = 0; plane < nPlanes; plane++)
o[plane * spatialVolume] = i[plane];
}
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
- uInt nPlanes, uInt spatialVolume, uInt *rules,
+ Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
T *d_o = d_output_features + rules[2 * outSite + 1];
- for (uInt plane = 0; plane < nPlanes; plane++)
+ for (Int plane = 0; plane < nPlanes; plane++)
d_i[plane] = d_o[plane * spatialVolume];
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "UnPooling.h"
template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
UnPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), &r[0], nHot);
}
}
template <typename T, Int Dimension>
void cpu_UnPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
for (auto &r : _rules) {
Int nHot = r.size() / 2;
UnPooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), &r[0], nHot);
}
}
@@ -6,27 +6,27 @@
#ifndef CPU_UNPOOLING_H
#define CPU_UNPOOLING_H
#include "../SparseConvNet.h"
template <typename T>
- void UnPooling_ForwardPass(T *input_features, T *output_features, uInt nPlanes,
- uInt input_stride, uInt output_stride, uInt *rules,
- uInt nHot, uInt filterVolume) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite + 1] * input_stride;
- uInt o = rules[2 * outSite] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
+ Int input_stride, Int output_stride, Int *rules,
+ Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite + 1] * input_stride;
+ Int o = rules[2 * outSite] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane];
}
}
template <typename T>
void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
- uInt nPlanes, uInt input_stride, uInt output_stride,
- uInt *rules, uInt nHot, uInt filterVolume) {
- for (uInt outSite = 0; outSite < nHot; outSite++) {
- uInt i = rules[2 * outSite + 1] * input_stride;
- uInt o = rules[2 * outSite] * output_stride;
- for (uInt plane = 0; plane < nPlanes; plane++)
+ Int nPlanes, Int input_stride, Int output_stride,
+ Int *rules, Int nHot) {
+ for (Int outSite = 0; outSite < nHot; outSite++) {
+ Int i = rules[2 * outSite + 1] * input_stride;
+ Int o = rules[2 * outSite] * output_stride;
+ for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] += d_output_features[o + plane];
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "ActivePooling.h"
template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
output_features.resize_({batchSize, nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
Int *rb = rulesBuffer.data<Int>();
Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
assert(rowBatchSize > 0);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
for (Int o = 0; o < batchSize; o += rowBatchSize) {
Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
sizeof(Int) * (maxActive + 1) * batchSize_,
cudaMemcpyHostToDevice);
ActivePooling_ForwardPass<T>(iF, oF + o * nPlanes, batchSize_, maxActive,
nPlanes, rb, average);
}
}
template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
Int *rb = rulesBuffer.data<Int>();
Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
assert(rowBatchSize > 0);
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
for (Int o = 0; o < batchSize; o += rowBatchSize) {
Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
sizeof(Int) * (maxActive + 1) * batchSize_,
cudaMemcpyHostToDevice);
ActivePooling_BackwardPass<T>(diF, doF + o * nPlanes, batchSize_, maxActive,
nPlanes, rb, average);
}
}
@@ -4,54 +4,52 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
- #ifndef GPU_ACTIVEPOOLING_H
- #define GPU_ACTIVEPOOLING_H
+ #ifndef CUDA_ACTIVEPOOLING_H
+ #define CUDA_ACTIVEPOOLING_H
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
- uInt maxActive, uInt nPlanes, uInt *rules,
+ Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &output_features[blockIdx.x * nPlanes];
- uInt *r = &rules[blockIdx.x * (maxActive + 1)];
- uInt nActive = *r++;
+ Int *r = &rules[blockIdx.x * (maxActive + 1)];
+ Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
- for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+ for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
out[plane] += inp[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
- uInt batchSize, uInt maxActive, uInt nPlanes,
- uInt *rules, bool average) {
- uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
- ActivePooling_fp<T> << <batchSize, kernelBlockDim, 0,
- THCState_getCurrentStream(state)>>>
- (input_features, output_features, maxActive, nPlanes, rules, average);
+ Int batchSize, Int maxActive, Int nPlanes,
+ Int *rules, bool average) {
+ Int kernelBlockDim = std::min(nPlanes, (Int)32);
+ ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
+ input_features, output_features, maxActive, nPlanes, rules, average);
}
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
- uInt maxActive, uInt nPlanes, uInt *rules,
+ Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &d_output_features[blockIdx.x * nPlanes];
- uInt *r = &rules[blockIdx.x * (maxActive + 1)];
- uInt nActive = *r++;
+ Int *r = &rules[blockIdx.x * (maxActive + 1)];
+ Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
- for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
+ for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
inp[plane] = out[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
- uInt batchSize, uInt maxActive, uInt nPlanes,
- uInt *rules, bool average) {
- uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
- ActivePooling_bp<T> << <batchSize, kernelBlockDim, 0,
- THCState_getCurrentStream(state)>>>
- (d_input_features, d_output_features, maxActive, nPlanes, rules, average);
+ Int batchSize, Int maxActive, Int nPlanes,
+ Int *rules, bool average) {
+ Int kernelBlockDim = std::min(nPlanes, (Int)32);
+ ActivePooling_bp<T><<<batchSize, kernelBlockDim>>>(
+ d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
- #endif /* GPU_ActivePOOLING_H */
+ #endif /* CUDA_ActivePOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AffineReluTrivialConvolution.h"
template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor convWeight) {
output_features.resize_({input_features.size(0), convWeight.size(1)});
dAffineReluTrivialConvolution_forward<T>(
input_features.data<T>(), output_features.data<T>(),
affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
convWeight.size(0), input_features.stride(0), convWeight.size(1),
output_features.size(1), input_features.size(0));
return input_features.size(0) * input_features.size(1) *
output_features.size(1);
}
template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor d_affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor d_affineBias,
/*cuda float*/ at::Tensor convWeight,
/*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
d_input_features.resize_as_(input_features);
dAffineReluTrivialConvolution_backward_dW<T>(
input_features.data<T>(), d_input_features.data<T>(),
d_output_features.data<T>(), affineWeight.data<T>(),
d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
input_features.stride(0), convWeight.size(1), d_output_features.stride(0),
input_features.size(0), additiveGrad);
}
@@ -4,18 +4,18 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
- #ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
- #define GPU_AFFINERELUTRIVIALCONVOLUTION_H
+ #ifndef CUDA_AFFINERELUTRIVIALCONVOLUTION_H
+ #define CUDA_AFFINERELUTRIVIALCONVOLUTION_H
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
- template <typename T, uInt K, uInt V>
+ template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
- T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
- uInt output_stride, uInt nActive) {
+ T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+ Int output_stride, Int nActive) {
// nActive must be a multiple of K!!
// Input x Weight -> Output
@@ -24,9 +24,9 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
- uInt M = input_nPlanes / K;
+ Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
- uInt n = blockIdx.y;
+ Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
@@ -35,7 +35,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
- const uInt tx = threadIdx.x;
+ const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
@@ -52,7 +52,7 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
- for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+ for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -82,20 +82,20 @@ __global__ void dAffineReluTrivialConvolution_forwardA(
inFeatures += K;
}
}
- template <typename T, uInt K, uInt V>
+ template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
- T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
- uInt output_stride, uInt nActive) {
+ T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
+ Int output_stride, Int nActive) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
- uInt M = input_nPlanes / K;
+ Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
- uInt n = blockIdx.y;
+ Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
@@ -104,7 +104,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
- const uInt tx = threadIdx.x;
+ const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
@@ -121,7 +121,7 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
- for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
+ for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -158,20 +158,19 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
- uInt o = (nActive / K) * K; \
+ Int o = (nActive / K) * K; \
if (o > 0) \
- dAffineReluTrivialConvolution_forwardA<T, K, V> << < \
- dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
- dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
- (inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
+ dAffineReluTrivialConvolution_forwardA< \
+ T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
+ dim3(K, K / V)>>>( \
+ inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB<T, K, V> << < \
dim3(1, output_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o); \
dAffineReluTrivialConvolution_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
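// Worked example of the dispatch above (illustrative values): with K=32 and
// nActive=100, o = (100/32)*32 = 96, so _forwardA covers rows [0,96) using at
// most 512 blocks along x and _forwardB mops up the remaining 4 rows.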
......@@ -179,10 +178,10 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
T *convWeight, uInt input_nPlanes,
uInt input_stride,
uInt output_nPlanes,
uInt output_stride, uInt nActive) {
T *convWeight, Int input_nPlanes,
Int input_stride,
Int output_nPlanes,
Int output_stride, Int nActive) {
FOO(T, 64, 16)
FOO(T, 32, 8)
......@@ -193,8 +192,8 @@ void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
template <>
void dAffineReluTrivialConvolution_forward<double>(
double *inFeatures, double *outFeatures, double *affineWeight,
double *affineBias, double *convWeight, uInt input_nPlanes,
uInt input_stride, uInt output_nPlanes, uInt output_stride, uInt nActive) {
double *affineBias, double *convWeight, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int output_stride, Int nActive) {
FOO(double, 32, 8)
FOO(double, 16, 4)
......@@ -206,15 +205,15 @@ void dAffineReluTrivialConvolution_forward<double>(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
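// Illustrative numbers (not from the source): with K=32, V=8 and
// input_nPlanes=64, gridDim.y = M = 2; each 32x4 block of threads produces a
// K-wide slice of dInput and accumulates K x K tiles of dConvWeight while
// looping over the N = output_nPlanes/K column blocks.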
template <typename T, uInt K, uInt V>
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) {
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
uInt N = output_nPlanes / K;
uInt m = blockIdx.y;
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
......@@ -234,7 +233,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const uInt tx = threadIdx.x;
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
......@@ -253,7 +252,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
}
__syncthreads();
for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
......@@ -303,15 +302,15 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) {
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
uInt N = output_nPlanes / K;
uInt m = blockIdx.y;
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
......@@ -331,7 +330,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const uInt tx = threadIdx.x;
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
......@@ -350,7 +349,7 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
}
__syncthreads();
for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
......@@ -406,20 +405,19 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nActive / K) * K; \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
(inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineReluTrivialConvolution_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dAffineReluTrivialConvolution_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
......@@ -432,8 +430,8 @@ template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) {
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AveragePooling.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
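// Assumed contract of RULEBOOKITERATOR (see RuleBookIterator.h): it expands
// its first argument once per rulebook group, binding rbB to that group's
// device-side (input,output) index pairs and nHotB to the pair count; the
// second (empty) argument runs after each group.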
RULEBOOKITERATOR(cuda_AveragePooling_ForwardPass<T>(
iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB, _rules.size());
, )
}
template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(cuda_AveragePooling_BackwardPass<T>(
diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB, _rules.size());
, )
}
......@@ -4,27 +4,27 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_AVERAGEPOOLING_H
#define GPU_AVERAGEPOOLING_H
#ifndef CUDA_AVERAGEPOOLING_H
#define CUDA_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
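// (a block has NTX*NTY threads and must load all NTY*2 rule entries into r
// in a single pass, so NTX*NTY >= 2*NTY, i.e. NTX >= 2)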
template <typename T, uInt NTX, uInt NTY>
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
uInt i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (n - nHot))
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
uInt i = r[2 * threadIdx.y] * input_stride;
uInt o = r[2 * threadIdx.y + 1] * output_stride;
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
atomicAdd(&output_features[o + plane],
alpha * input_features[i + plane]);
}
......@@ -33,31 +33,31 @@ __global__ void AveragePooling_fp(T *input_features, T *output_features,
}
template <typename T>
void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
T *output_features, uInt nPlanes,
uInt input_stride, uInt output_stride,
uInt *rules, uInt nHot, uInt filterVolume) {
AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
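// Passing alpha = 1/filterVolume makes the atomicAdd accumulation in
// AveragePooling_fp compute sum(active inputs) / filterVolume per output
// site, i.e. the pool average with absent input sites counted as zero.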
template <typename T, uInt NTX, uInt NTY>
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt input_stride,
uInt output_stride, uInt *rules, uInt nHot,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
uInt i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (n - nHot))
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
uInt i = r[2 * threadIdx.y] * input_stride;
uInt o = r[2 * threadIdx.y + 1] * output_stride;
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += alpha * d_output_features[o + plane];
}
__syncthreads();
......@@ -65,12 +65,12 @@ __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
}
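// Note: unlike the forward kernel, AveragePooling_bp updates d_input_features
// with a plain += rather than atomicAdd; this is safe only if each input row
// appears at most once within a rulebook group (assumed from how the rules
// are grouped by filter offset).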
template <typename T>
void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
T *d_output_features, uInt nPlanes,
uInt input_stride, uInt output_stride,
uInt *rules, uInt nHot, uInt filterVolume) {
AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules,
Int nHot, Int filterVolume) {
AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
#endif /* GPU_AVERAGEPOOLING_H */
#endif /* CUDA_AVERAGEPOOLING_H */