Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
......@@ -5,66 +5,58 @@
// LICENSE file in the root directory of this source tree.
#include <array>
#include <tuple>
// Using 32 bit integers for coordinates and memory calculations.
// They could be replaced with 64 bit integers.
// Advantages of 64 bit:
// - support for nFeatures * nActiveSites > 2^32 per hidden layer per batch
// Disadvantages:
// - larger, and therefore slower, data copies from CPU -> GPU
// - more device memory needed to store sparseconvnet 'rulebooks'
// - not really needed until GPUs have >> 32GB RAM
using Int = int32_t;
using uInt = uint32_t; // The max value, uInt_MAX, denotes 'non-existent'
const uInt uInt_MAX = 4294967295; // 2^32-1
const uInt Int_MAX = 2147483647; // 2^31-1
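// Illustrative sketch (not from the commit): the 32-bit limit described
// above can be guarded explicitly. fitsInt32 is a hypothetical helper; the
// product is formed in 64 bits so the check itself cannot wrap.
inline bool fitsInt32(int64_t nFeatures, int64_t nActiveSites) {
  return nFeatures * nActiveSites <= (int64_t)Int_MAX;
}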
// Point<dimension> is a point in the d-dimensional integer lattice
// (i.e. square-grid/cubic-grid, ...)
template <uInt dimension> using Point = std::array<Int, dimension>;
template <Int dimension> using Point = std::array<Int, dimension>;
template <uInt dimension> Point<dimension> LongTensorToPoint(THLongTensor *t) {
template <Int dimension>
Point<dimension> LongTensorToPoint(/*long*/ at::Tensor &t) {
Point<dimension> p;
long *td = THLongTensor_data(t);
for (int i = 0; i < dimension; i++)
long *td = t.data<long>();
for (Int i = 0; i < dimension; i++)
p[i] = td[i];
return p;
}
template <uInt dimension>
Point<2 * dimension> TwoLongTensorsToPoint(THLongTensor *t0, THLongTensor *t1) {
template <Int dimension>
Point<2 * dimension> TwoLongTensorsToPoint(/*long*/ at::Tensor &t0,
/*long*/ at::Tensor &t1) {
Point<2 * dimension> p;
long *td;
td = THLongTensor_data(t0);
for (int i = 0; i < dimension; i++)
td = t0.data<long>();
for (Int i = 0; i < dimension; i++)
p[i] = td[i];
td = THLongTensor_data(t1);
for (int i = 0; i < dimension; i++)
td = t1.data<long>();
for (Int i = 0; i < dimension; i++)
p[i + dimension] = td[i];
return p;
}
template <uInt dimension>
Point<3 * dimension> ThreeLongTensorsToPoint(THLongTensor *t0, THLongTensor *t1,
THLongTensor *t2) {
template <Int dimension>
Point<3 * dimension> ThreeLongTensorsToPoint(/*long*/ at::Tensor &t0,
/*long*/ at::Tensor &t1,
/*long*/ at::Tensor &t2) {
Point<3 * dimension> p;
long *td;
td = THLongTensor_data(t0);
for (int i = 0; i < dimension; i++)
td = t0.data<long>();
for (Int i = 0; i < dimension; i++)
p[i] = td[i];
td = THLongTensor_data(t1);
for (int i = 0; i < dimension; i++)
td = t1.data<long>();
for (Int i = 0; i < dimension; i++)
p[i + dimension] = td[i];
td = THLongTensor_data(t2);
for (int i = 0; i < dimension; i++)
td = t2.data<long>();
for (Int i = 0; i < dimension; i++)
p[i + 2 * dimension] = td[i];
return p;
}
// FNV-style hash function for Point<dimension>
// (note: the FNV prime and offset basis appear here in swapped roles
// relative to canonical FNV-1a)
template <uInt dimension> struct IntArrayHash {
template <Int dimension> struct IntArrayHash {
std::size_t operator()(Point<dimension> const &p) const {
uInt hash = 16777619;
Int hash = 16777619;
for (auto x : p) {
hash *= 2166136261;
hash ^= x;
......@@ -73,5 +65,4 @@ template <uInt dimension> struct IntArrayHash {
}
};
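// Illustrative usage sketch (not from the commit): IntArrayHash makes
// Point<d> usable as a hash-map key, as SparseGridMap in Metadata.h does
// with google::dense_hash_map. Assumes <unordered_map> is included.
inline Int exampleLookup() {
  std::unordered_map<Point<3>, Int, IntArrayHash<3>> sites;
  sites[Point<3>{{1, 2, 3}}] = 0; // active site (1,2,3) stored as row 0
  return sites.at(Point<3>{{1, 2, 3}});
}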
#define THCITensor THCudaIntTensor
#define THCITensor_(NAME) TH_CONCAT_3(THCITensor, _, NAME)
#define at_kINT at::kInt
......@@ -5,66 +5,58 @@
// LICENSE file in the root directory of this source tree.
#include <array>
#include <tuple>
// Using 32 bit integers for coordinates and memory calculations.
// They could be replaced with 64 bit integers.
// Advantages of 64 bit:
// - support for nFeatures * nActiveSites > 2^32 per hidden layer per batch
// Disadvantages:
// - larger, and therefore slower, data copies from CPU -> GPU
// - more device memory needed to store sparseconvnet 'rulebooks'
// - not really needed until GPUs have >> 32GB RAM
// Using 64 bit integers for coordinates and memory calculations.
using Int = int64_t;
using uInt = uint64_t; // The max value, uInt_MAX, denotes 'non-existent'
const uInt uInt_MAX = 18446744073709551615ULL; // 2^64-1
const uInt Int_MAX = 9223372036854775807; // 2^63-1
// Point<dimension> is a point in the d-dimensional integer lattice
// (i.e. square-grid/cubic-grid, ...)
template <uInt dimension> using Point = std::array<Int, dimension>;
template <Int dimension> using Point = std::array<Int, dimension>;
template <uInt dimension> Point<dimension> LongTensorToPoint(THLongTensor *t) {
template <Int dimension>
Point<dimension> LongTensorToPoint(/*long*/ at::Tensor &t) {
Point<dimension> p;
long *td = THLongTensor_data(t);
for (int i = 0; i < dimension; i++)
long *td = t.data<long>();
for (Int i = 0; i < dimension; i++)
p[i] = td[i];
return p;
}
template <uInt dimension>
Point<2 * dimension> TwoLongTensorsToPoint(THLongTensor *t0, THLongTensor *t1) {
template <Int dimension>
Point<2 * dimension> TwoLongTensorsToPoint(/*long*/ at::Tensor &t0,
/*long*/ at::Tensor &t1) {
Point<2 * dimension> p;
long *td;
td = THLongTensor_data(t0);
for (int i = 0; i < dimension; i++)
td = t0.data<long>();
for (Int i = 0; i < dimension; i++)
p[i] = td[i];
td = THLongTensor_data(t1);
for (int i = 0; i < dimension; i++)
td = t1.data<long>();
for (Int i = 0; i < dimension; i++)
p[i + dimension] = td[i];
return p;
}
template <uInt dimension>
Point<3 * dimension> ThreeLongTensorsToPoint(THLongTensor *t0, THLongTensor *t1,
THLongTensor *t2) {
template <Int dimension>
Point<3 * dimension> ThreeLongTensorsToPoint(/*long*/ at::Tensor &t0,
/*long*/ at::Tensor &t1,
/*long*/ at::Tensor &t2) {
Point<3 * dimension> p;
long *td;
td = THLongTensor_data(t0);
for (int i = 0; i < dimension; i++)
td = t0.data<long>();
for (Int i = 0; i < dimension; i++)
p[i] = td[i];
td = THLongTensor_data(t1);
for (int i = 0; i < dimension; i++)
td = t1.data<long>();
for (Int i = 0; i < dimension; i++)
p[i + dimension] = td[i];
td = THLongTensor_data(t2);
for (int i = 0; i < dimension; i++)
td = t2.data<long>();
for (Int i = 0; i < dimension; i++)
p[i + 2 * dimension] = td[i];
return p;
}
// FNV Hash function for Point<dimension>
template <uInt dimension> struct IntArrayHash {
template <Int dimension> struct IntArrayHash {
std::size_t operator()(Point<dimension> const &p) const {
uInt hash = 14695981039346656037;
Int hash = -3750763034362895579; // 14695981039346656037;
for (auto x : p) {
hash *= 1099511628211;
hash ^= x;
......@@ -73,5 +65,4 @@ template <uInt dimension> struct IntArrayHash {
}
};
#define THCITensor THCudaLongTensor
#define THCITensor_(NAME) TH_CONCAT_3(THCITensor, _, NAME)
#define at_kINT at::kLong
......@@ -6,7 +6,6 @@
#ifndef ACTIVEPOOLING_H
#define ACTIVEPOOLING_H
#include "../SparseConvNet.h"
// Return the maximum number of active sites in the batch
// rules has size 1.
......@@ -14,14 +13,14 @@
// First column is number of active sites for that sample (<= maxActive)
// Remaining maxActive columns give the active sites, zero padded.
template <uInt dimension>
template <Int dimension>
void activePoolingRules(SparseGrids<dimension> &SGs, RuleBook &rules) {
rules.clear();
rules.resize(2);
auto &r = rules[0];
uInt maxActive = 0;
Int maxActive = 0;
for (auto &sg : SGs)
maxActive = std::max(maxActive, (uInt)sg.mp.size());
maxActive = std::max(maxActive, (Int)sg.mp.size());
for (auto &sg : SGs) {
r.push_back(sg.mp.size());
for (auto &iter : sg.mp)
......
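// Worked example for activePoolingRules above (illustrative, not from the
// commit): a batch with 2 and 3 active sites gives maxActive == 3, and
// rules[0] holds one zero-padded row per sample:
//   rules[0] = { 2, a0, a1, 0,      // sample 0: 2 sites, one pad slot
//                3, b0, b1, b2 };   // sample 1: 3 sites, no padding
// a*/b* stand for global row indices of active sites; batchSize and
// maxActive are read back from rules[1] (see generic/CPU/ActivePooling.cpp
// further down).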
......@@ -8,7 +8,7 @@
#define CONVOLUTIONRULES_H
#include "RectangularRegions.h"
template <uInt dimension>
template <Int dimension>
void Convolution_InputSgToRulesAndOutputSg(SparseGrid<dimension> &inputGrid,
SparseGrid<dimension> &outputGrid,
RuleBook &rules, long *size,
......@@ -17,10 +17,11 @@ void Convolution_InputSgToRulesAndOutputSg(SparseGrid<dimension> &inputGrid,
rules.resize(volume<dimension>(size));
for (auto const &inIter : inputGrid.mp) {
for (auto j : OutputRegionCalculator<dimension>(inIter.first, size, stride,
outputSpatialSize)) {
auto outRegion = OutputRegionCalculator<dimension>(
inIter.first, size, stride, outputSpatialSize);
for (auto j : outRegion) {
auto inRegion = InputRegionCalculator<dimension>(j, size, stride);
uInt rulesOffset = inRegion.offset(inIter.first);
Int rulesOffset = inRegion.offset(inIter.first);
auto outIter = outputGrid.mp.find(j);
if (outIter == outputGrid.mp.end()) {
outIter =
......@@ -32,19 +33,19 @@ void Convolution_InputSgToRulesAndOutputSg(SparseGrid<dimension> &inputGrid,
}
}
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs(SparseGrids<dimension> &input_SGs,
SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *filterSize,
long *filterStride,
long *input_spatialSize,
long *output_spatialSize) {
template <Int dimension>
Int Convolution_InputSgsToRulesAndOutputSgs(SparseGrids<dimension> &input_SGs,
SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *filterSize,
long *filterStride,
long *input_spatialSize,
long *output_spatialSize) {
rules.clear();
output_SGs.clear();
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
output_SGs.resize(batchSize);
uInt output_nActive = 0;
for (uInt i = 0; i < batchSize; i++) {
Int output_nActive = 0;
for (Int i = 0; i < batchSize; i++) {
auto &iSG = input_SGs[i];
auto &oSG = output_SGs[i];
oSG.ctr = output_nActive;
......@@ -57,43 +58,43 @@ uInt Convolution_InputSgsToRulesAndOutputSgs(SparseGrids<dimension> &input_SGs,
return output_nActive;
}
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
template <Int dimension>
Int Convolution_InputSgsToRulesAndOutputSgs_OMP(
SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *filterSize, long *filterStride,
long *input_spatialSize, long *output_spatialSize) {
rules.clear();
rules.resize(volume<dimension>(filterSize));
output_SGs.clear();
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
output_SGs.resize(batchSize);
std::vector<RuleBook> rbs(batchSize);
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < batchSize; i++)
Convolution_InputSgToRulesAndOutputSg<dimension>(
input_SGs[i], output_SGs[i], rbs[i], filterSize, filterStride,
input_spatialSize, output_spatialSize);
}
uInt output_nActive = 0;
for (uInt i = 0; i < batchSize; i++) {
Int output_nActive = 0;
for (Int i = 0; i < batchSize; i++) {
// Parallel assignment:
// output_nActive <- output_nActive+output_SGs[i].ctr
// output_SGs[i].ctr <- output_nActive
uInt tmp = output_nActive;
Int tmp = output_nActive;
output_nActive += output_SGs[i].ctr;
output_SGs[i].ctr = tmp;
}
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < rules.size(); i++) {
for (i = 0; i < (Int)rules.size(); i++) {
auto &R = rules[i];
for (uInt j = 0; j < batchSize; j++) {
for (Int j = 0; j < batchSize; j++) {
auto &r = rbs[j][i];
auto offset = output_SGs[j].ctr;
for (uInt k = 0; k < r.size();) {
for (Int k = 0; k < (Int)r.size();) {
R.push_back(r[k++]);
R.push_back(r[k++] + offset);
}
......@@ -105,19 +106,19 @@ uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
// for each active site, list of (inputFeatureNumber, batchIdx, spatialOffset)
// triples
template <uInt dimension>
template <Int dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs(
SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
rules.clear();
rules.resize(batchSize);
Point<dimension> lb, ub;
for (int i = 0; i < dimension; ++i) {
for (Int i = 0; i < dimension; ++i) {
lb[i] = 0;
ub[i] = spatialSize[i] - 1;
}
auto region = RectangularRegion<dimension>(lb, ub);
for (uInt batchIdx = 0; batchIdx < batchSize; batchIdx++) {
for (Int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
auto &iSG = input_SGs[batchIdx];
for (auto const &inIter : iSG.mp) {
rules[batchIdx].push_back(inIter.second + iSG.ctr);
......@@ -126,19 +127,19 @@ void SparseToDense_InputSgsToRulesAndOutputSgs(
}
}
template <uInt dimension>
template <Int dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
rules.clear();
rules.resize(batchSize);
Point<dimension> lb, ub;
for (int i = 0; i < dimension; ++i) {
for (Int i = 0; i < dimension; ++i) {
lb[i] = 0;
ub[i] = spatialSize[i] - 1;
}
auto region = RectangularRegion<dimension>(lb, ub);
uInt batchIdx;
Int batchIdx;
#pragma omp parallel for private(batchIdx)
for (batchIdx = 0; batchIdx < batchSize; batchIdx++) {
auto &iSG = input_SGs[batchIdx];
......
......@@ -8,7 +8,7 @@
#define FULLDECONVOLUTIONRULES_H
#include "RectangularRegions.h"
template <uInt dimension>
template <Int dimension>
void FullConvolution_InputSgToRulesAndOutputSg(
SparseGrid<dimension> &inputGrid, SparseGrid<dimension> &outputGrid,
RuleBook &rules, long *size, long *stride, long *inputSpatialSize,
......@@ -20,9 +20,7 @@ void FullConvolution_InputSgToRulesAndOutputSg(
auto outRegion =
InputRegionCalculator<dimension>(inIter.first, size, stride);
for (auto j : outRegion) {
auto inRegion =
OutputRegionCalculator<dimension>(j, size, stride, outputSpatialSize);
uInt rulesOffset = outRegion.offset(j);
Int rulesOffset = outRegion.offset(j);
auto outIter = outputGrid.mp.find(j);
if (outIter == outputGrid.mp.end()) {
outIter =
......@@ -34,17 +32,17 @@ void FullConvolution_InputSgToRulesAndOutputSg(
}
}
template <uInt dimension>
uInt FullConvolution_InputSgsToRulesAndOutputSgs(
template <Int dimension>
Int FullConvolution_InputSgsToRulesAndOutputSgs(
SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *filterSize, long *filterStride,
long *input_spatialSize, long *output_spatialSize) {
rules.clear();
output_SGs.clear();
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
output_SGs.resize(batchSize);
uInt output_nActive = 0;
for (uInt i = 0; i < batchSize; i++) {
Int output_nActive = 0;
for (Int i = 0; i < batchSize; i++) {
auto &iSG = input_SGs[i];
auto &oSG = output_SGs[i];
oSG.ctr = output_nActive;
......@@ -57,43 +55,43 @@ uInt FullConvolution_InputSgsToRulesAndOutputSgs(
return output_nActive;
}
template <uInt dimension>
uInt FullConvolution_InputSgsToRulesAndOutputSgs_OMP(
template <Int dimension>
Int FullConvolution_InputSgsToRulesAndOutputSgs_OMP(
SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *filterSize, long *filterStride,
long *input_spatialSize, long *output_spatialSize) {
rules.clear();
rules.resize(volume<dimension>(filterSize));
output_SGs.clear();
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
output_SGs.resize(batchSize);
std::vector<RuleBook> rbs(batchSize);
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < batchSize; i++)
FullConvolution_InputSgToRulesAndOutputSg<dimension>(
input_SGs[i], output_SGs[i], rbs[i], filterSize, filterStride,
input_spatialSize, output_spatialSize);
}
uInt output_nActive = 0;
for (uInt i = 0; i < batchSize; i++) {
Int output_nActive = 0;
for (Int i = 0; i < batchSize; i++) {
// Parallel assignment:
// output_nActive <- output_nActive+output_SGs[i].ctr
// output_SGs[i].ctr <- output_nActive
uInt tmp = output_nActive;
Int tmp = output_nActive;
output_nActive += output_SGs[i].ctr;
output_SGs[i].ctr = tmp;
}
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < rules.size(); i++) {
for (i = 0; i < (Int)rules.size(); i++) {
auto &R = rules[i];
for (uInt j = 0; j < batchSize; j++) {
for (Int j = 0; j < batchSize; j++) {
auto &r = rbs[j][i];
auto offset = output_SGs[j].ctr;
for (uInt k = 0; k < r.size();) {
for (Int k = 0; k < (Int)r.size();) {
R.push_back(r[k++]);
R.push_back(r[k++] + offset);
}
......
......@@ -6,7 +6,7 @@
#ifndef INPUTLAYER_H
#define INPUTLAYER_H
#include "../SparseConvNet.h"
// Rulebook Format
// rules[0][0] == mode
......@@ -16,10 +16,10 @@
// rules[1] nOutputRows x (1+maxActive)
// mode: 0 == guaranteed unique, 1 == overwrite, 2 == keep, 3 == sum, 4 == mean
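// Worked example (illustrative, not from the commit): if input rows 4 and 7
// carry the same coordinate they map to one output row. Per the code below,
// mode 1 emits the rules[1] entry { 1, outputRows[i].front() } == { 1, 4 },
// mode 2 emits { 1, 7 } (.back()), and modes 3/4 emit { 2, 4, 7, ... },
// padded to width 1 + maxActive, before features are summed or averaged.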
template <uInt dimension>
template <Int dimension>
void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
uInt nInputRows, uInt nInputColumns, uInt batchSize,
uInt mode, uInt &nActive) {
Int nInputRows, Int nInputColumns, Int batchSize, Int mode,
Int &nActive) {
assert(nActive == 0);
assert(rules.size() == 0);
assert(SGs.size() == 0);
......@@ -37,20 +37,20 @@ void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
if (nInputColumns == dimension) {
SGs.resize(1);
auto &sg = SGs[0];
for (int i = 0; i < nInputRows; ++i) {
for (int j = 0; j < dimension; j++)
for (Int i = 0; i < nInputRows; ++i) {
for (Int j = 0; j < dimension; j++)
p[j] = coords[j];
coords += dimension;
sg.mp[p] = i;
}
} else { // nInputColumns == dimension + 1
uInt idx;
for (int i = 0; i < nInputRows; ++i) {
for (int j = 0; j < dimension; j++)
Int idx;
for (Int i = 0; i < nInputRows; ++i) {
for (Int j = 0; j < dimension; j++)
p[j] = coords[j];
idx = coords[dimension];
coords += dimension + 1;
if (idx + 1 >= SGs.size())
if (idx + 1 >= (Int)SGs.size())
SGs.resize(idx + 1);
SGs[idx].mp[p] = i;
}
......@@ -59,12 +59,12 @@ void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
}
// Compile list of how input rows correspond to output rows
std::vector<std::vector<uInt>> outputRows;
std::vector<std::vector<Int>> outputRows;
if (nInputColumns == dimension) {
SGs.resize(1);
auto &sg = SGs[0];
for (int i = 0; i < nInputRows; ++i) {
for (int j = 0; j < dimension; j++)
for (Int i = 0; i < nInputRows; ++i) {
for (Int j = 0; j < dimension; j++)
p[j] = coords[j];
coords += dimension;
auto iter = sg.mp.find(p);
......@@ -75,13 +75,13 @@ void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
outputRows[sg.mp[p]].push_back(i);
}
} else { // nInputColumns == dimension + 1
uInt idx;
for (int i = 0; i < nInputRows; ++i) {
for (int j = 0; j < dimension; j++)
Int idx;
for (Int i = 0; i < nInputRows; ++i) {
for (Int j = 0; j < dimension; j++)
p[j] = coords[j];
idx = coords[dimension];
coords += dimension + 1;
if (idx + 1 >= SGs.size())
if (idx + 1 >= (Int)SGs.size())
SGs.resize(idx + 1);
auto &sg = SGs[idx];
auto iter = sg.mp.find(p);
......@@ -99,21 +99,21 @@ void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
rules[0].push_back(outputRows.size());
auto &rule = rules[1];
if (mode == 1) {
for (uInt i = 0; i < nActive; ++i) {
for (Int i = 0; i < nActive; ++i) {
rule.push_back(1);
rule.push_back(outputRows[i].front());
}
}
if (mode == 2) {
for (uInt i = 0; i < nActive; ++i) {
for (Int i = 0; i < nActive; ++i) {
rule.push_back(1);
rule.push_back(outputRows[i].back());
}
}
if (mode == 3 or mode == 4) {
uInt maxActive = 0;
Int maxActive = 0;
for (auto &row : outputRows)
maxActive = std::max(maxActive, (uInt)row.size());
maxActive = std::max(maxActive, (Int)row.size());
rules[0][1] = maxActive;
for (auto &row : outputRows) {
rule.push_back(row.size());
......@@ -125,8 +125,6 @@ void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
}
}
// Rulebook Format
// rules[0][0] == mode
// rules[0][1] == maxActive per spatial location (==1 for modes 0,1,2)
......@@ -138,14 +136,14 @@ void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
// bl is a batchSize x length x dimension long array of coordinates
// mode: 0 == guaranteed unique and all present; 1 == overwrite, 2 == keep,
// 3 == sum, 4 == mean
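// Illustrative note (an inference from the p[0] >= 0 test in blRules, not
// from the commit): since bl is a dense batchSize x length x dimension
// array, variable-length samples can be padded with rows whose first
// coordinate is negative, e.g.
//   coords[b][l] = {-1, -1, -1}; // padding row, skipped when building rules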
template <uInt dimension>
template <Int dimension>
void blRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
uInt batchSize, uInt length, uInt mode, uInt &nActive) {
Int batchSize, Int length, Int mode, Int &nActive) {
assert(nActive == 0);
assert(rules.size() == 0);
assert(SGs.size() == 0);
SGs.resize(batchSize);
uInt I;
Int I;
if (mode == 0) {
nActive = batchSize * length;
......@@ -161,8 +159,8 @@ void blRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
sg.ctr = I * length;
auto c = coords + I * length * dimension;
Point<dimension> p;
for (int l = 0; l < length; ++l) {
for (int j = 0; j < dimension; ++j)
for (Int l = 0; l < length; ++l) {
for (Int j = 0; j < dimension; ++j)
p[j] = c[j];
c += dimension;
sg.mp[p] = l;
......@@ -172,18 +170,18 @@ void blRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
}
// Compile list of how input rows correspond to output rows
std::vector<std::vector<std::vector<uInt>>> outputRows(batchSize);
std::vector<uInt> nActives(batchSize);
std::vector<std::vector<std::vector<Int>>> outputRows(batchSize);
std::vector<Int> nActives(batchSize);
#pragma omp parallel for private(I)
for (I = 0; I < batchSize; I++) {
auto &sg = SGs[I];
auto &ors = outputRows[I];
auto &nAct = nActives[I];
auto c = coords + I * length * dimension;
uInt i = I * length;
Int i = I * length;
Point<dimension> p;
for (int l = 0; l < length; ++l, ++i) {
for (int j = 0; j < dimension; ++j)
for (Int l = 0; l < length; ++l, ++i) {
for (Int j = 0; j < dimension; ++j)
p[j] = *c++;
if (p[0] >= 0) {
auto iter = sg.mp.find(p);
......@@ -200,11 +198,11 @@ void blRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
SGs[I].ctr = nActive;
nActive += nActives[I];
}
uInt maxActive = 1;
Int maxActive = 1;
if (mode >= 3)
for (auto &ors : outputRows)
for (auto &row : ors)
maxActive = std::max(maxActive, (uInt)row.size());
maxActive = std::max(maxActive, (Int)row.size());
rules.resize(2);
rules[0].push_back(mode);
......@@ -247,7 +245,7 @@ void blRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
auto rr = &rule[SGs[I].ctr * (maxActive + 1)];
for (auto &row : ors) {
rr[0] = row.size();
for (int i = 0; i < row.size(); ++i)
for (Int i = 0; i < (Int)row.size(); ++i)
rr[i + 1] = row[i];
rr += 1 + maxActive;
}
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Metadata.h"
#include "ActivePoolingRules.h"
#include "ConvolutionRules.h"
#include "FullConvolutionRules.h"
#include "IOLayersRules.h"
#include "RandomizedStrideRules.h"
#include "SubmanifoldConvolutionRules.h"
template <Int dimension> SparseGrid<dimension>::SparseGrid() : ctr(0) {
// Sparsehash needs a key to be set aside and never used - we use
// (-1,...,-1)
Point<dimension> empty_key;
for (Int i = 0; i < dimension; ++i)
empty_key[i] = -1;
mp.set_empty_key(empty_key);
}
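// Illustrative note (not from the commit): dense_hash_map reserves the
// empty key internally, so (-1,...,-1) must never be inserted as a real
// coordinate. For example:
//   SparseGrid<3> sg;
//   sg.mp[Point<3>{{0, 0, 0}}] = 0; // fine
//   sg.mp[Point<3>{{-1, -1, -1}}];  // would collide with the empty key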
template <typename T> T *OptionalTensorData(at::Tensor tensor) {
return tensor.numel() ? tensor.data<T>() : nullptr;
}
template <Int dimension>
void addPointToSparseGridMapAndFeatures(SparseGridMap<dimension> &mp,
Point<dimension> p, Int &nActive,
long nPlanes,
/*float*/ at::Tensor features,
float *vec, bool overwrite) {
auto iter = mp.find(p);
if (iter == mp.end()) {
iter = mp.insert(std::make_pair(p, nActive++)).first;
features.resize_({(int)nActive, nPlanes});
std::memcpy(features.data<float>() + (nActive - 1) * nPlanes, vec,
sizeof(float) * nPlanes);
} else if (overwrite) {
std::memcpy(features.data<float>() + iter->second * nPlanes, vec,
sizeof(float) * nPlanes);
}
}
template <Int dimension>
Metadata<dimension>::Metadata()
: re(std::chrono::system_clock::now().time_since_epoch().count()) {}
template <Int dimension> void Metadata<dimension>::clear() {
nActive.clear();
grids.clear();
activePoolingRuleBooks.clear();
inputLayerRuleBook.clear();
validRuleBooks.clear();
ruleBooks.clear();
fullConvolutionRuleBooks.clear();
sparseToDenseRuleBooks.clear();
inputSGs = nullptr;
inputSG = nullptr;
inputNActive = nullptr;
inputLayerRuleBook.clear();
blLayerRuleBook.clear();
}
template <Int dimension>
Int Metadata<dimension>::getNActive(/*long*/ at::Tensor spatialSize) {
return nActive[LongTensorToPoint<dimension>(spatialSize)];
};
template <Int dimension>
SparseGrids<dimension> &
Metadata<dimension>::getSparseGrid(/*long*/ at::Tensor spatialSize) {
return grids[LongTensorToPoint<dimension>(spatialSize)];
};
template <Int dimension>
void Metadata<dimension>::setInputSpatialSize(/*long*/ at::Tensor spatialSize) {
inputSpatialSize = LongTensorToPoint<dimension>(spatialSize);
inputSGs = &grids[inputSpatialSize];
inputNActive = &nActive[inputSpatialSize];
}
template <Int dimension> void Metadata<dimension>::batchAddSample() {
assert(inputSGs && "Call setInputSpatialSize first, please!");
inputSGs->resize(inputSGs->size() + 1);
inputSG = &inputSGs->back();
}
template <Int dimension>
void Metadata<dimension>::setInputSpatialLocation(/*float*/ at::Tensor features,
/*long*/ at::Tensor location,
/*float*/ at::Tensor vec,
bool overwrite) {
auto p = LongTensorToPoint<dimension>(location);
SparseGridMap<dimension> &mp = inputSG->mp;
Int &nActive = *inputNActive;
auto nPlanes = vec.size(0);
addPointToSparseGridMapAndFeatures<dimension>(
mp, p, nActive, nPlanes, features, vec.data<float>(), overwrite);
}
template <Int dimension>
void Metadata<dimension>::setInputSpatialLocations(
/*float*/ at::Tensor features,
/*long*/ at::Tensor locations,
/*float*/ at::Tensor vecs, bool overwrite) {
/* assert(locations.ndimension() == 2 and "locations must be 2
* dimensional!"); */
/* assert(vecs.ndimension() == 2 and "vecs must be 2 dimensional!"); */
/* assert(locations.size(0) == vecs.size(0) and */
/* "Location.size(0) and vecs.size(0) must be equal!"); */
/* assert((locations.size(1) == dimension or */
/* locations.size(1) == 1 + dimension) and */
/* "locations.size(0) must be either dimension or dimension+1"); */
Point<dimension> p;
Int &nActive = *inputNActive;
auto nPlanes = vecs.size(1);
long *l = locations.data<long>();
float *v = vecs.data<float>();
if (locations.size(1) == dimension) {
// add points to current sample
assert(inputSG);
SparseGridMap<dimension> &mp = inputSG->mp;
for (Int idx = 0; idx < locations.size(0); ++idx) {
for (Int d = 0; d < dimension; ++d)
p[d] = *l++;
addPointToSparseGridMapAndFeatures<dimension>(mp, p, nActive, nPlanes,
features, v, overwrite);
v += nPlanes;
}
}
if (locations.size(1) == dimension + 1) {
// add new samples to batch as necessary
auto &SGs = *inputSGs;
for (Int idx = 0; idx < locations.size(0); ++idx) {
for (Int d = 0; d < dimension; ++d)
p[d] = *l++;
Int batch = *l++;
if (batch >= (Int)SGs.size()) {
SGs.resize(batch + 1);
}
SparseGridMap<dimension> &mp = SGs[batch].mp;
addPointToSparseGridMapAndFeatures<dimension>(mp, p, nActive, nPlanes,
features, v, overwrite);
v += nPlanes;
}
}
}
template <Int dimension>
void Metadata<dimension>::getSpatialLocations(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor locations) {
Int nActive = getNActive(spatialSize);
auto &SGs = getSparseGrid(spatialSize);
Int batchSize = SGs.size();
locations.resize_({(int)nActive, dimension + 1});
locations.zero_();
auto lD = locations.data<long>();
for (Int i = 0; i < batchSize; i++) {
auto mp = SGs[i].mp;
auto offset = SGs[i].ctr;
for (auto it = mp.begin(); it != mp.end(); ++it) {
for (Int d = 0; d < dimension; ++d) {
lD[(it->second + offset) * (dimension + 1) + d] = it->first[d];
}
lD[(it->second + offset) * (dimension + 1) + dimension] = i;
}
}
}
template <Int dimension>
void Metadata<dimension>::createMetadataForDenseToSparse(
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor nz_, long batchSize) {
clear();
setInputSpatialSize(spatialSize);
inputSGs->resize(batchSize);
auto &nActive = *inputNActive;
nActive = nz_.size(0);
long *nz = nz_.data<long>();
std::vector<Int> br(batchSize + 1);
if (batchSize == 1) {
br[1] = nActive;
} else {
long b = 0;
for (Int i = 0; i < nActive; i++) {
long B = nz[i * (dimension + 1)];
for (; b < B;)
br[++b] = i;
}
for (; b < batchSize;)
br[++b] = nActive;
}
Int b;
#pragma omp parallel for private(b)
for (b = 0; b < batchSize; b++) {
auto &sg = inputSGs->at(b);
for (Int i = br[b]; i < br[b + 1]; i++) {
Point<dimension> x;
for (Int j = 0; j < dimension; j++) {
x[j] = nz[i * (dimension + 1) + j + 1]; // 0-indexed
}
sg.mp[x] = i;
}
}
}
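// Worked example (illustrative, not from the commit): with batchSize == 3
// and nz_ rows whose leading batch indices are {0, 0, 1, 2, 2, 2}, the loop
// above yields br == {0, 2, 3, 6}, so sample b owns feature rows
// [br[b], br[b+1]); a sample with no rows gets an empty range.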
template <Int dimension>
void Metadata<dimension>::sparsifyMetadata(Metadata<dimension> &mOut,
/*long*/ at::Tensor spatialSize,
/*byte*/ at::Tensor filter,
/*long*/ at::Tensor cuSum) {
// Create a new SparseGrids with fewer entries.
mOut.clear();
auto p = LongTensorToPoint<dimension>(spatialSize);
auto &sgsIn = grids[p];
auto &sgsOut = mOut.grids[p];
sgsOut.resize(sgsIn.size());
if (filter.ndimension() == 1) {
auto f = filter.data<unsigned char>();
auto cs = cuSum.data<long>();
auto nActive = cs[cuSum.numel() - 1];
mOut.nActive[p] = nActive;
Int sample;
#pragma omp parallel for private(sample)
for (sample = 0; sample < (Int)sgsIn.size(); ++sample) {
auto &sgIn = sgsIn[sample];
auto &sgOut = sgsOut[sample];
for (auto const &iter : sgIn.mp) {
auto n = iter.second + sgIn.ctr;
if (f[n])
sgOut.mp[iter.first] = cs[n] - 1;
}
}
} else {
mOut.nActive[p] = 0;
}
}
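// Worked example (illustrative, not from the commit): with
// filter == {1, 0, 1, 1} and inclusive cuSum == {1, 1, 2, 3}, active sites
// 0, 2 and 3 survive and are renumbered cs[n] - 1 == 0, 1, 2; the new
// metadata's nActive is cs.back() == 3.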
// tensor is size[0] x .. x size[dimension-1] x size[dimension]
// size[0] x .. x size[dimension-1] == spatial volume
// size[dimension] == #feature planes
template <Int dimension>
void Metadata<dimension>::addSampleFromThresholdedTensor(
/*float*/ at::Tensor features_,
/*float*/ at::Tensor tensor_,
/*long*/ at::Tensor offset_,
/*long*/ at::Tensor spatialSize_, float threshold) {
auto &nActive = *inputNActive;
auto &SGs = *inputSGs;
SGs.resize(SGs.size() + 1);
auto &sg = SGs.back();
auto tensor = tensor_.data<float>();
auto offset = offset_.data<long>();
auto spatialSize = spatialSize_.data<long>();
long size[dimension + 1]; // IntList?
for (Int i = 0; i <= dimension; ++i)
size[i] = tensor_.size(i); // std::vector<long> size = tensor_.size();
auto nPlanes = size[dimension];
long volume = 1;
for (Int i = 0; i < dimension; ++i)
volume *= size[i];
features_.resize_({(int)(nActive + volume), nPlanes});
// Increment pointers as we work through the data
auto features = features_.data<float>() + nActive * nPlanes;
// Active locations
Point<dimension> point;
for (Int i = 0; i < dimension; i++)
point[i] = offset[i];
for (Int ctr = 0; ctr < volume; ctr++) {
bool active = false;
for (Int i = 0; i < nPlanes; i++) {
if (fabs(tensor[i]) > threshold) {
active = true;
break;
}
}
for (Int i = 0; i < dimension; i++) {
if (point[i] < 0 or point[i] >= spatialSize[i]) {
active = false;
break;
}
}
if (active) {
sg.mp[point] = nActive++;
std::memcpy(features, tensor, sizeof(float) * nPlanes);
features += nPlanes;
}
tensor += nPlanes;
incrementPointInCube<dimension>(point, size, offset);
}
features_.resize_({(int)nActive, nPlanes});
}
// 3x3 submanifold convolutions, 3x3/2x2 pooling or strided convolutions
template <Int dimension> void Metadata<dimension>::generateRuleBooks3s2() {
long sz[dimension], str[dimension], inS[dimension], outS[dimension];
Point<dimension> p1;
Point<2 * dimension> p2;
Point<3 * dimension> p3;
for (Int i = 0; i < dimension; ++i) {
p1[i] = p2[i] = p3[i] = inS[i] = inputSpatialSize[i];
p2[i + dimension] = p3[i + dimension] = sz[i] = 3;
p3[i + 2 * dimension] = str[i] = 2;
}
while (true) {
auto &SGs = grids[p1];
auto &rb = validRuleBooks[p2];
if (rb.empty())
SubmanifoldConvolution_SgsToRules(SGs, rb, sz);
for (Int i = 0; i < dimension; ++i)
if (p1[i] < 3 or p1[i] % 2 != 1)
return;
else
p1[i] = outS[i] = (inS[i] - 1) / 2;
auto &SGs2 = grids[p1];
auto &rb2 = ruleBooks[p3];
if (rb2.empty())
nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(SGs, SGs2, rb2, sz,
str, inS, outS);
for (Int i = 0; i < dimension; ++i)
p2[i] = p3[i] = inS[i] = outS[i];
}
}
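// Worked example (illustrative, not from the commit): starting from spatial
// size 15 per side, generateRuleBooks3s2 visits scales 15 -> 7 -> 3 -> 1,
// building a 3^d submanifold rulebook at each scale and a size-3/stride-2
// convolution rulebook between scales; it returns once a side is even or
// smaller than 3.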
// 3x3 submanifold convolutions, 2x2 pooling or strided convolutions
template <Int dimension> void Metadata<dimension>::generateRuleBooks2s2() {
long s2[dimension], s3[dimension], inS[dimension], outS[dimension];
Point<dimension> p1;
Point<2 * dimension> p2;
Point<3 * dimension> p3;
for (Int i = 0; i < dimension; ++i) {
p1[i] = p2[i] = p3[i] = inS[i] = inputSpatialSize[i];
p2[i + dimension] = s3[i] = 3;
p3[i + dimension] = p3[i + 2 * dimension] = s2[i] = 2;
}
while (true) {
auto &SGs = grids[p1];
auto &rb = validRuleBooks[p2];
if (rb.empty())
SubmanifoldConvolution_SgsToRules(SGs, rb, s3);
for (Int i = 0; i < dimension; ++i)
if (p1[i] < 2 or p1[i] % 2 != 0)
return;
else
p1[i] = outS[i] = inS[i] / 2;
auto &SGs2 = grids[p1];
auto &rb2 = ruleBooks[p3];
if (rb2.empty())
nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(SGs, SGs2, rb2, s2,
s2, inS, outS);
for (Int i = 0; i < dimension; ++i)
p2[i] = p3[i] = inS[i] = outS[i];
}
}
template <Int dimension>
void Metadata<dimension>::inputLayer(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor coords, Int batchSize,
Int mode) {
assert(spatialSize.ndimension() == 1);
assert(spatialSize.size(0) == dimension);
assert(coords.ndimension() == 2);
assert(coords.size(1) >= dimension and coords.size(1) <= dimension + 1);
setInputSpatialSize(spatialSize);
inputLayerRules<dimension>(*inputSGs, inputLayerRuleBook, coords.data<long>(),
coords.size(0), coords.size(1), batchSize, mode,
*inputNActive);
}
template <Int dimension>
void Metadata<dimension>::blLayer(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor coords, Int mode) {
assert(spatialSize.ndimension() == 1);
assert(spatialSize.size(0) == dimension);
assert(coords.ndimension() == 3);
assert(coords.size(2) == dimension);
setInputSpatialSize(spatialSize);
blRules<dimension>(*inputSGs, blLayerRuleBook, coords.data<long>(),
coords.size(0), coords.size(1), mode, *inputNActive);
}
template <Int dimension>
RuleBook &
Metadata<dimension>::getSubmanifoldRuleBook(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor size,
bool openMP) {
auto p = TwoLongTensorsToPoint<dimension>(spatialSize, size);
auto &rb = validRuleBooks[p];
if (rb.empty()) {
auto &SGs = grids[LongTensorToPoint<dimension>(spatialSize)];
#if defined(ENABLE_OPENMP)
openMP ? SubmanifoldConvolution_SgsToRules_OMP(SGs, rb, size.data<long>()) :
#endif
SubmanifoldConvolution_SgsToRules(SGs, rb, size.data<long>());
}
return rb;
}
template <Int dimension>
RuleBook &
Metadata<dimension>::getActivePoolingRuleBook(/*long*/ at::Tensor spatialSize) {
auto spatialSz = LongTensorToPoint<dimension>(spatialSize);
auto &SGs = grids[spatialSz];
auto &rb = activePoolingRuleBooks[spatialSz];
if (rb.empty())
activePoolingRules(SGs, rb);
return rb;
}
template <Int dimension>
RuleBook &
Metadata<dimension>::getSparseToDenseRuleBook(/*long*/ at::Tensor spatialSize,
bool openMP) {
auto ss = LongTensorToPoint<dimension>(spatialSize);
auto &SGs = grids[ss];
auto &rb = sparseToDenseRuleBooks[ss];
if (rb.empty())
#if defined(ENABLE_OPENMP)
openMP ? SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
SGs, rb, spatialSize.data<long>())
:
#endif
SparseToDense_InputSgsToRulesAndOutputSgs(SGs, rb,
spatialSize.data<long>());
return rb;
}
template <Int dimension>
RuleBook &
Metadata<dimension>::getRuleBook(/*long*/ at::Tensor inputSpatialSize,
/*long*/ at::Tensor outputSpatialSize,
/*long*/ at::Tensor size,
/*long*/ at::Tensor stride, bool openMP) {
auto p = ThreeLongTensorsToPoint<dimension>(inputSpatialSize, size, stride);
auto &rb = ruleBooks[p];
if (rb.empty()) {
auto iS = LongTensorToPoint<dimension>(inputSpatialSize);
auto oS = LongTensorToPoint<dimension>(outputSpatialSize);
auto &iSGs = grids[iS];
auto &oSGs = grids[oS];
nActive[oS] =
#if defined(ENABLE_OPENMP)
openMP
? Convolution_InputSgsToRulesAndOutputSgs_OMP(
iSGs, oSGs, rb, size.data<long>(), stride.data<long>(),
inputSpatialSize.data<long>(), outputSpatialSize.data<long>())
:
#endif
Convolution_InputSgsToRulesAndOutputSgs(
iSGs, oSGs, rb, size.data<long>(), stride.data<long>(),
inputSpatialSize.data<long>(), outputSpatialSize.data<long>());
}
return rb;
}
template <Int dimension>
RuleBook &Metadata<dimension>::getFullConvolutionRuleBook(
/*long*/ at::Tensor inputSpatialSize,
/*long*/ at::Tensor outputSpatialSize,
/*long*/ at::Tensor size,
/*long*/ at::Tensor stride, Metadata<dimension> &newM) {
auto p = ThreeLongTensorsToPoint<dimension>(inputSpatialSize, size, stride);
auto &rb = fullConvolutionRuleBooks[p];
if (rb.empty()) {
newM.clear();
auto iS = LongTensorToPoint<dimension>(inputSpatialSize);
auto oS = LongTensorToPoint<dimension>(outputSpatialSize);
newM.grids[iS] = grids[iS]; // copy
newM.nActive[iS] = nActive[iS];
auto &iSGs = newM.grids[iS];
auto &oSGs = newM.grids[oS];
newM.nActive[oS] = FullConvolution_InputSgsToRulesAndOutputSgs_OMP(
iSGs, oSGs, rb, size.data<long>(), stride.data<long>(),
inputSpatialSize.data<long>(), outputSpatialSize.data<long>());
}
return rb;
}
template <Int dimension>
RuleBook &Metadata<dimension>::getRandomizedStrideRuleBook(
/*long*/ at::Tensor inputSpatialSize,
/*long*/ at::Tensor outputSpatialSize,
/*long*/ at::Tensor size,
/*long*/ at::Tensor stride, bool openMP) {
auto p = ThreeLongTensorsToPoint<dimension>(inputSpatialSize, size, stride);
auto &rb = ruleBooks[p];
if (rb.empty()) {
auto iS = LongTensorToPoint<dimension>(inputSpatialSize);
auto oS = LongTensorToPoint<dimension>(outputSpatialSize);
auto &iSGs = grids[iS];
auto &oSGs = grids[oS];
nActive[oS] =
#if defined(ENABLE_OPENMP)
openMP
? RSR_InputSgsToRulesAndOutputSgs_OMP(
iSGs, oSGs, rb, size.data<long>(), stride.data<long>(),
inputSpatialSize.data<long>(), outputSpatialSize.data<long>(),
re)
:
#endif
RSR_InputSgsToRulesAndOutputSgs(iSGs, oSGs, rb, size.data<long>(),
stride.data<long>(),
inputSpatialSize.data<long>(),
outputSpatialSize.data<long>(), re);
}
return rb;
}
template <Int dimension> Int volume(long *point) {
Int v = 1;
for (Int i = 0; i < dimension; i++)
v *= point[i];
return v;
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef Metadata_H
#define Metadata_H
#include "32bits.h"
#include <array>
#include <chrono>
#include <cstdint>
#include <google/dense_hash_map>
#include <iostream>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
template <Int dimension>
using SparseGridMap =
google::dense_hash_map<Point<dimension>, Int, IntArrayHash<dimension>,
std::equal_to<Point<dimension>>>;
template <Int dimension> class SparseGrid {
public:
Int ctr;
SparseGridMap<dimension> mp;
SparseGrid();
};
template <Int dimension> using SparseGrids = std::vector<SparseGrid<dimension>>;
using RuleBook = std::vector<std::vector<Int>>;
template <Int dimension>
void addPointToSparseGridMapAndFeatures(SparseGridMap<dimension> &mp,
Point<dimension> p, Int &nActive,
long nPlanes,
/*float*/ at::Tensor features,
float *vec, bool overwrite);
template <Int dimension> class Metadata {
public:
// Count of active sites for each scale
std::unordered_map<Point<dimension>, Int, IntArrayHash<dimension>> nActive;
// Hash tables for each scale locating the active points
std::unordered_map<Point<dimension>, SparseGrids<dimension>,
IntArrayHash<dimension>>
grids;
std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
activePoolingRuleBooks;
RuleBook inputLayerRuleBook;
RuleBook blLayerRuleBook;
std::unordered_map<Point<2 * dimension>, RuleBook,
IntArrayHash<2 * dimension>>
validRuleBooks;
std::unordered_map<Point<3 * dimension>, RuleBook,
IntArrayHash<3 * dimension>>
ruleBooks;
std::unordered_map<Point<3 * dimension>, RuleBook,
IntArrayHash<3 * dimension>>
fullConvolutionRuleBooks;
std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
sparseToDenseRuleBooks;
Point<dimension> inputSpatialSize;
SparseGrids<dimension> *inputSGs;
SparseGrid<dimension> *inputSG;
Int *inputNActive;
std::default_random_engine re;
Metadata();
void clear();
Int getNActive(/*long*/ at::Tensor spatialSize);
SparseGrids<dimension> &getSparseGrid(/*long*/ at::Tensor spatialSize);
void setInputSpatialSize(/*long*/ at::Tensor spatialSize);
void batchAddSample();
void setInputSpatialLocation(/*float*/ at::Tensor features,
/*long*/ at::Tensor location,
/*float*/ at::Tensor vec, bool overwrite);
void setInputSpatialLocations(/*float*/ at::Tensor features,
/*long*/ at::Tensor locations,
/*float*/ at::Tensor vecs, bool overwrite);
void getSpatialLocations(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor locations);
void createMetadataForDenseToSparse(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor nz_, long batchSize);
void sparsifyMetadata(Metadata<dimension> &mOut,
/*long*/ at::Tensor spatialSize,
/*byte*/ at::Tensor filter,
/*long*/ at::Tensor cuSum);
// tensor is size[0] x .. x size[dimension-1] x size[dimension]
// size[0] x .. x size[dimension-1] == spatial volume
// size[dimension] == #feature planes
void addSampleFromThresholdedTensor(/*float*/ at::Tensor features_,
/*float*/ at::Tensor tensor_,
/*long*/ at::Tensor offset_,
/*long*/ at::Tensor spatialSize_,
float threshold);
// 3x3 submanifold convolutions, 3x3/2x2 pooling or strided convolutions
void generateRuleBooks3s2();
// 3x3 submanifold convolutions, 2x2 pooling or strided convolutions
void generateRuleBooks2s2();
void inputLayer(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor coords, Int batchSize, Int mode);
void blLayer(/*long*/ at::Tensor spatialSize, /*long*/ at::Tensor coords,
Int mode);
RuleBook &getSubmanifoldRuleBook(/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor size, bool openMP);
RuleBook &getActivePoolingRuleBook(/*long*/ at::Tensor spatialSize);
RuleBook &getSparseToDenseRuleBook(/*long*/ at::Tensor spatialSize,
bool openMP);
RuleBook &getRuleBook(/*long*/ at::Tensor inputSpatialSize,
/*long*/ at::Tensor outputSpatialSize,
/*long*/ at::Tensor size,
/*long*/ at::Tensor stride, bool openMP);
RuleBook &getFullConvolutionRuleBook(/*long*/ at::Tensor inputSpatialSize,
/*long*/ at::Tensor outputSpatialSize,
/*long*/ at::Tensor size,
/*long*/ at::Tensor stride,
Metadata<dimension> &newM);
RuleBook &getRandomizedStrideRuleBook(/*long*/ at::Tensor inputSpatialSize,
/*long*/ at::Tensor outputSpatialSize,
/*long*/ at::Tensor size,
/*long*/ at::Tensor stride,
bool openMP);
};
template <typename T> T *OptionalTensorData(at::Tensor tensor);
template <Int dimension> Int volume(long *point);
#endif
......@@ -11,27 +11,28 @@
class RSRTicks {
public:
std::vector<uInt> inputL;
std::vector<uInt> inputR;
std::vector<uInt> outputL;
std::vector<uInt> outputR;
RSRTicks(uInt input_spatialSize, uInt output_spatialSize, uInt size, uInt stride, std::default_random_engine re) {
std::vector<uInt> steps;
//steps.resize(output_spatialSize/3,stride-1);
//steps.resize(output_spatialSize/3*2,stride+1);
steps.resize(output_spatialSize-1,stride);
std::vector<Int> inputL;
std::vector<Int> inputR;
std::vector<Int> outputL;
std::vector<Int> outputR;
RSRTicks(Int input_spatialSize, Int output_spatialSize, Int size, Int stride,
std::default_random_engine re) {
std::vector<Int> steps;
// steps.resize(output_spatialSize/3,stride-1);
// steps.resize(output_spatialSize/3*2,stride+1);
steps.resize(output_spatialSize - 1, stride);
std::shuffle(steps.begin(), steps.end(), re);
inputL.push_back(0);
inputR.push_back(size-1);
inputR.push_back(size - 1);
for (auto step : steps) {
inputL.push_back(inputL.back()+step);
inputR.push_back(inputR.back()+step);
inputL.push_back(inputL.back() + step);
inputR.push_back(inputR.back() + step);
}
assert(inputR.back() == input_spatialSize - 1);
outputL.resize(input_spatialSize, output_spatialSize);
outputR.resize(input_spatialSize, 0);
for (uInt i = 0; i < output_spatialSize; i++) {
for (uInt j = inputL[i]; j <= inputR[i]; j++) {
for (Int i = 0; i < output_spatialSize; i++) {
for (Int j = inputL[i]; j <= inputR[i]; j++) {
outputL[j] = std::min(outputL[j], i);
outputR[j] = std::max(outputR[j], i);
}
......@@ -42,74 +43,76 @@ public:
typedef std::vector<RSRTicks> RSRTicksV;
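// Worked example (illustrative, not from the commit): with size == 3,
// stride == 2 and output_spatialSize == 3, the assert above forces
// input_spatialSize == (3 - 1) * 2 + 3 == 7, and the ticks come out as
//   inputL == {0, 2, 4}, inputR == {2, 4, 6};
// output i pools inputs [inputL[i], inputR[i]]. The commented-out
// steps.resize lines sketch the intended randomization: a shuffled mix of
// stride-1 and stride+1 steps instead of a constant stride.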
RSRTicksV RSRRegions(long *input_spatialSize, long *output_spatialSize,
uInt dimension, long *size, long *stride, std::default_random_engine re) {
Int dimension, long *size, long *stride,
std::default_random_engine re) {
RSRTicksV t;
for (uInt i = 0; i < dimension; i++)
for (Int i = 0; i < dimension; i++)
t.emplace_back(RSRTicks(input_spatialSize[i], output_spatialSize[i],
size[i], stride[i], re));
size[i], stride[i], re));
return t;
}
template <uInt dimension>
template <Int dimension>
RectangularRegion<dimension>
RSRInputRegionCalculator(const Point<dimension> &output, RSRTicksV &t) {
Point<dimension> lb, ub;
for (uInt i = 0; i < dimension; i++) {
for (Int i = 0; i < dimension; i++) {
lb[i] = t[i].inputL[output[i]];
ub[i] = t[i].inputR[output[i]];
}
return RectangularRegion<dimension>(lb, ub);
}
template <uInt dimension>
template <Int dimension>
RectangularRegion<dimension>
RSROutputRegionCalculator(const Point<dimension> &input, RSRTicksV &t) {
Point<dimension> lb, ub;
for (uInt i = 0; i < dimension; i++) {
for (Int i = 0; i < dimension; i++) {
lb[i] = t[i].outputL[input[i]];
ub[i] = t[i].outputR[input[i]];
}
return RectangularRegion<dimension>(lb, ub);
}
template <uInt dimension>
template <Int dimension>
void RSR_InputSgToRulesAndOutputSg(SparseGrid<dimension> &inputGrid,
SparseGrid<dimension> &outputGrid,
RuleBook &rules, RSRTicksV &t, long *size, long *stride) {
RuleBook &rules, RSRTicksV &t, long *size,
long *stride) {
rules.resize(volume<dimension>(size));
for (auto const &inIter : inputGrid.mp) {
for (auto j : RSROutputRegionCalculator<dimension>(inIter.first, t)) {
auto inRegion = RSRInputRegionCalculator<dimension>(j, t);
uInt rulesOffset = inRegion.offset(inIter.first);
Int rulesOffset = inRegion.offset(inIter.first);
auto outIter = outputGrid.mp.find(j);
if (outIter == outputGrid.mp.end()) {
outIter =
outputGrid.mp.insert(std::make_pair(j, outputGrid.ctr++)).first;
}
assert(inIter.second<1e6);
assert(outIter->second<1e6);
assert(inIter.second < 1e6);
assert(outIter->second < 1e6);
rules[rulesOffset].push_back(inIter.second + inputGrid.ctr);
rules[rulesOffset].push_back(outIter->second);
}
}
}
template <uInt dimension>
uInt RSR_InputSgsToRulesAndOutputSgs(SparseGrids<dimension> &input_SGs,
SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *size,long *stride,
long *input_spatialSize,
long *output_spatialSize,
std::default_random_engine re) {
auto t = RSRRegions(input_spatialSize, output_spatialSize, dimension,
size, stride, re);
template <Int dimension>
Int RSR_InputSgsToRulesAndOutputSgs(SparseGrids<dimension> &input_SGs,
SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *size, long *stride,
long *input_spatialSize,
long *output_spatialSize,
std::default_random_engine re) {
auto t = RSRRegions(input_spatialSize, output_spatialSize, dimension, size,
stride, re);
rules.clear();
output_SGs.clear();
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
output_SGs.resize(batchSize);
uInt output_nActive = 0;
for (uInt i = 0; i < batchSize; i++) {
Int output_nActive = 0;
for (Int i = 0; i < batchSize; i++) {
auto &iSG = input_SGs[i];
auto &oSG = output_SGs[i];
oSG.ctr = output_nActive;
......@@ -120,47 +123,46 @@ uInt RSR_InputSgsToRulesAndOutputSgs(SparseGrids<dimension> &input_SGs,
return output_nActive;
}
template <uInt dimension>
uInt RSR_InputSgsToRulesAndOutputSgs_OMP(SparseGrids<dimension> &input_SGs,
SparseGrids<dimension> &output_SGs,
RuleBook &rules,
long *size, long *stride,
long *input_spatialSize,
long *output_spatialSize,
std::default_random_engine re) {
auto t = RSRRegions(input_spatialSize, output_spatialSize, dimension,
size, stride, re);
template <Int dimension>
Int RSR_InputSgsToRulesAndOutputSgs_OMP(SparseGrids<dimension> &input_SGs,
SparseGrids<dimension> &output_SGs,
RuleBook &rules, long *size,
long *stride, long *input_spatialSize,
long *output_spatialSize,
std::default_random_engine re) {
auto t = RSRRegions(input_spatialSize, output_spatialSize, dimension, size,
stride, re);
rules.clear();
rules.resize(volume<dimension>(size));
output_SGs.clear();
uInt batchSize = input_SGs.size();
Int batchSize = input_SGs.size();
output_SGs.resize(batchSize);
std::vector<RuleBook> rbs(batchSize);
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < batchSize; i++)
RSR_InputSgToRulesAndOutputSg<dimension>(input_SGs[i], output_SGs[i],
rbs[i], t, size, stride);
}
uInt output_nActive = 0;
for (uInt i = 0; i < batchSize; i++) {
Int output_nActive = 0;
for (Int i = 0; i < batchSize; i++) {
// Parallel assignment:
// output_nActive <- output_nActive+output_SGs[i].ctr
// output_SGs[i].ctr <- output_nActive
uInt tmp = output_nActive;
Int tmp = output_nActive;
output_nActive += output_SGs[i].ctr;
output_SGs[i].ctr = tmp;
}
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < rules.size(); i++) {
for (i = 0; i < (Int)rules.size(); i++) {
auto &R = rules[i];
for (uInt j = 0; j < batchSize; j++) {
for (Int j = 0; j < batchSize; j++) {
auto &r = rbs[j][i];
auto offset = output_SGs[j].ctr;
for (uInt k = 0; k < r.size();) {
for (Int k = 0; k < (Int)r.size();) {
R.push_back(r[k++]);
R.push_back(r[k++] + offset);
}
......
......@@ -6,14 +6,14 @@
#ifndef RECTANGULARREGIONS_H
#define RECTANGULARREGIONS_H
#include "../SparseConvNet.h"
// For iterating over the rectangular region with corners lb and ub.
// The .end() method and operator!= are designed to allow range based for
// loops of the region, but nothing else.
template <uInt dimension> class RectangularRegionIterator;
template <uInt dimension> class RectangularRegion {
template <Int dimension> class RectangularRegionIterator;
template <Int dimension> class RectangularRegion {
public:
Point<dimension> lb;
Point<dimension> ub;
......@@ -27,9 +27,9 @@ public:
// Otherwise it would need to represent a point just outside the region
return RectangularRegionIterator<dimension>(*this, ub);
}
uInt
Int
offset(const Point<dimension> &p) { // Enumerate the points inside the region
uInt of = 0, m = 1;
Int of = 0, m = 1;
for (Int i = dimension - 1; i >= 0; i--) {
of += m * (p[i] - lb[i]);
m *= ub[i] - lb[i] + 1;
......@@ -38,13 +38,13 @@ public:
}
};
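// Worked example (illustrative, not from the commit): offset() enumerates
// the region's points in row-major order, so for lb == (0,0) and
// ub == (2,2) (a 3x3 region), offset({1,2}) == (1-0)*3 + (2-0) == 5.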
template <uInt dimension> class RectangularRegionIterator {
template <Int dimension> class RectangularRegionIterator {
private:
RectangularRegion<dimension> &region;
public:
bool stillLooping;
Point<dimension> point;
bool stillLooping;
RectangularRegionIterator(RectangularRegion<dimension> &region,
Point<dimension> &point)
: region(region), point(point), stillLooping(true) {
......@@ -73,14 +73,14 @@ public:
};
// Only to be used for checking the end point of range based for loops.
template <uInt dimension>
template <Int dimension>
inline bool operator!=(const RectangularRegionIterator<dimension> &lhs,
const RectangularRegionIterator<dimension> &rhs) {
return lhs.stillLooping;
}
// Similar to above but for [ offset[0] ... offset[0]+size[0]-1 ] x ... x [..]
template <uInt dimension>
template <Int dimension>
void incrementPointInCube(Point<dimension> &point, long *size, long *offset) {
for (Int i = dimension - 1; i >= 0; i--) {
point[i]++;
......@@ -92,12 +92,12 @@ void incrementPointInCube(Point<dimension> &point, long *size, long *offset) {
// For a convolutional layer with given filter *size* and *stride*, find the
// subset of the input field corresponding to a point in the output.
template <uInt dimension>
template <Int dimension>
RectangularRegion<dimension>
InputRegionCalculator(const Point<dimension> &output, long *size,
long *stride) {
Point<dimension> lb, ub;
for (uInt i = 0; i < dimension; i++) {
for (Int i = 0; i < dimension; i++) {
lb[i] = output[i] * stride[i];
ub[i] = output[i] * stride[i] + size[i] - 1;
}
......@@ -106,12 +106,12 @@ InputRegionCalculator(const Point<dimension> &output, long *size,
// For a convolutional layer with given filter *size* and *stride*, find the
// subset of the output field corresponding to a point in the input.
template <uInt dimension>
template <Int dimension>
RectangularRegion<dimension>
OutputRegionCalculator(const Point<dimension> &input, long *size, long *stride,
long *outputSpatialSize) {
Point<dimension> lb, ub;
for (uInt i = 0; i < dimension; i++) {
for (Int i = 0; i < dimension; i++) {
lb[i] = std::max(0L, (input[i] - size[i] + stride[i]) / stride[i]);
ub[i] = std::min(outputSpatialSize[i] - 1, input[i] / stride[i]);
}
......
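// Worked example (illustrative, not from the commit): with size == 3 and
// stride == 2, output point 4 reads inputs [4*2, 4*2+3-1] == [8, 10]
// (InputRegionCalculator), while input point 9 feeds outputs
// [(9-3+2)/2, 9/2] == [4, 4] (OutputRegionCalculator): the two calculators
// agree that input 9 contributes to output 4.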
......@@ -8,11 +8,11 @@
#define VALIDCONVOLUTIONRULES_H
// Full input region for an output point
template <uInt dimension>
template <Int dimension>
RectangularRegion<dimension>
InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
Point<dimension> lb, ub;
for (uInt i = 0; i < dimension; i++) {
for (Int i = 0; i < dimension; i++) {
Int pad = size[i] / 2;
lb[i] = output[i] - pad;
ub[i] = output[i] + size[i] - 1 - pad;
......@@ -23,15 +23,14 @@ InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
// Call for each convolutional / max-pooling layer, once for each batch item.
// rules is used to carry out the "lowering" whilst carrying out the convolution
template <uInt dimension>
double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
long *size) {
uInt sd = volume<dimension>(size);
template <Int dimension>
double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid,
RuleBook &rules, long *size) {
double countActiveInputs = 0;
for (auto const &outputIter : grid.mp) {
auto inRegion =
InputRegionCalculator_Valid<dimension>(outputIter.first, size);
uInt rulesOffset = 0;
Int rulesOffset = 0;
for (auto inputPoint : inRegion) {
auto inputIter = grid.mp.find(inputPoint);
if (inputIter != grid.mp.end()) {
......@@ -45,43 +44,43 @@ double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &r
return countActiveInputs;
}
template <uInt dimension>
uInt SubmanifoldConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
long *size) {
uInt sd = volume<dimension>(size);
uInt countActiveInputs = 0;
template <Int dimension>
Int SubmanifoldConvolution_SgsToRules(SparseGrids<dimension> &SGs,
RuleBook &rules, long *size) {
Int sd = volume<dimension>(size);
Int countActiveInputs = 0;
rules.clear();
rules.resize(sd);
for (uInt i = 0; i < SGs.size(); i++)
for (Int i = 0; i < (Int)SGs.size(); i++)
countActiveInputs +=
SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rules, size);
return countActiveInputs;
}
template <uInt dimension>
uInt SubmanifoldConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
RuleBook &rules, long *size) {
template <Int dimension>
Int SubmanifoldConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
RuleBook &rules, long *size) {
std::vector<RuleBook> rbs(SGs.size());
std::vector<double> countActiveInputs(SGs.size());
rules.clear();
uInt sd = volume<dimension>(size);
Int sd = volume<dimension>(size);
rules.resize(sd);
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < SGs.size(); i++) {
for (i = 0; i < (Int)SGs.size(); i++) {
rbs[i].resize(sd);
countActiveInputs[i] =
SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
}
}
{
uInt i;
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < sd; i++)
for (auto const &rb : rbs)
rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end());
}
uInt countActiveInputs_ = 0;
Int countActiveInputs_ = 0;
for (auto &i : countActiveInputs)
countActiveInputs_ += i;
return countActiveInputs_;
......
from torch.utils.ffi import _wrap_function
from ._SCN import lib as _lib, ffi as _ffi
__all__ = []
def _import_symbols(locals):
for symbol in dir(_lib):
fn = getattr(_lib, symbol)
if callable(fn):
locals[symbol] = _wrap_function(fn, _ffi)
else:
locals[symbol] = fn
__all__.append(symbol)
_import_symbols(locals())
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/ActivePooling.cpp"
#else
#include "ActivePooling.h"
extern "C" void scn_DR_(ActivePooling_updateOutput)(
THLongTensor *inputSize, void **m, THTensor *input_features,
THTensor *output_features, bool average) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1];
auto _rules = _m.getActivePoolingRuleBook(inputSize);
uInt batchSize = _rules[1][0];
uInt maxActive = _rules[1][1];
THTensor_(resize2d)(output_features, batchSize, nPlanes);
THTensor_(zero)(output_features);
ActivePooling_ForwardPass<real>(THTensor_(data)(input_features),
THTensor_(data)(output_features), batchSize,
maxActive, nPlanes, _rules, average);
}
extern "C" void scn_DR_(ActivePooling_updateGradInput)(
THLongTensor *inputSize, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *d_output_features,
bool average) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1];
auto _rules = _m.getActivePoolingRuleBook(inputSize);
uInt batchSize = _rules[1][0];
uInt maxActive = _rules[1][1];
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
ActivePooling_BackwardPass<real>(
THTensor_(data)(d_input_features), THTensor_(data)(d_output_features),
batchSize, maxActive, nPlanes, _rules, average);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/AffineReluTrivialConvolution.cpp"
#else
#include "AffineReluTrivialConvolution.h"
extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
THTensor *input_features, THTensor *output_features, THTensor *affineWeight,
THTensor *affineBias, THTensor *convWeight) {
THTensor_(resize2d)(output_features, input_features->size[0],
convWeight->size[1]);
AffineReluTrivialConvolution_ForwardPass(
THTensor_(data)(input_features), convWeight->size[0],
input_features->stride[0], THTensor_(data)(output_features),
convWeight->size[1], output_features->stride[0],
THTensor_(data)(affineWeight), THTensor_(data)(affineBias),
THTensor_(data)(convWeight), input_features->size[0]);
}
extern "C" void scn_R_(AffineReluTrivialConvolution_backward)(
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *affineWeight,
THTensor *d_affineWeight, THTensor *affineBias, THTensor *d_affineBias,
THTensor *convWeight, THTensor *d_convWeight, bool additiveGrad) {
THTensor_(resizeAs)(d_input_features, input_features);
AffineReluTrivialConvolution_BackwardPass(
THTensor_(data)(input_features), THTensor_(data)(d_input_features),
convWeight->size[0], input_features->stride[0],
THTensor_(data)(d_output_features), convWeight->size[1],
d_output_features->stride[0], THTensor_(data)(affineWeight),
THTensor_(data)(d_affineWeight), THTensor_(data)(affineBias),
THTensor_(data)(d_affineBias), THTensor_(data)(convWeight),
THTensor_(data)(d_convWeight), input_features->size[0], additiveGrad);
}
#endif
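// --- Illustrative sketch (not part of the original file) ---
// What the fused kernel above computes, written naively: for each active row,
// out = max(in * affineWeight + affineBias, 0) . convWeight, with convWeight
// stored row-major as nIn x nOut. Assumes <vector> and <algorithm>; the real
// AffineReluTrivialConvolution_ForwardPass blocks the loops and uses BLAS.
template <typename T>
void AffineReluTrivialConvolution_sketch(const T *in, uInt nIn, const T *aw,
                                         const T *ab, const T *W, T *out,
                                         uInt nOut, uInt nRows) {
  std::vector<T> tmp(nIn);
  for (uInt r = 0; r < nRows; r++) {
    for (uInt i = 0; i < nIn; i++) // per-plane affine, then ReLU
      tmp[i] = std::max<T>(in[r * nIn + i] * aw[i] + ab[i], T(0));
    for (uInt j = 0; j < nOut; j++) { // trivial (1x1) convolution per row
      T s = 0;
      for (uInt i = 0; i < nIn; i++)
        s += tmp[i] * W[i * nOut + j];
      out[r * nOut + j] = s;
    }
  }
}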
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/AveragePooling.cpp"
#else
#include "AveragePooling.h"
extern "C" void scn_DR_(AveragePooling_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THTensor *input_features,
THTensor *output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resize2d)(output_features, nActive,
input_features->size[1] - nFeaturesToDrop);
THTensor_(zero)(output_features);
auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
auto oF = THTensor_(data)(output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
AveragePooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
output_features->stride[0], &r[0], nHot,
_rules.size());
}
}
extern "C" void scn_DR_(AveragePooling_updateGradInput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *d_output_features,
long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
auto diF = THTensor_(data)(d_input_features) + nFeaturesToDrop;
auto doF = THTensor_(data)(d_output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
AveragePooling_BackwardPass<real>(
diF, doF, nPlanes, input_features->stride[0],
d_output_features->stride[0], &r[0], nHot, _rules.size());
}
}
#endif
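// --- Illustrative sketch (not part of the original file) ---
// A plausible AveragePooling_ForwardPass matching the call site above: each
// rule r is a flat list of nHot (inputRow, outputRow) pairs for one pool
// offset, and _rules.size() (the pool volume) is the normaliser. The strides
// differ from nPlanes when nFeaturesToDrop skips leading planes.
template <typename T>
void AveragePooling_ForwardPass_sketch(const T *iF, T *oF, uInt nPlanes,
                                       uInt iStride, uInt oStride,
                                       const uInt *rule, uInt nHot,
                                       uInt filterVolume) {
  for (uInt h = 0; h < nHot; h++) {
    const T *in = iF + rule[2 * h] * iStride;
    T *out = oF + rule[2 * h + 1] * oStride;
    for (uInt p = 0; p < nPlanes; p++)
      out[p] += in[p] / (T)filterVolume;
  }
}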
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/BatchNormalization.cpp"
#else
#include "BatchNormalization.h"
extern "C" void scn_R_(BatchNormalization_updateOutput)(
THTensor *input_features, THTensor *output_features, THTensor *saveMean,
THTensor *saveInvStd, THTensor *runningMean, THTensor *runningVar,
THTensor *weight, THTensor *bias, real eps, real momentum, bool train,
real leakiness) {
THTensor_(resizeAs)(output_features, input_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(weight), THOptionalTensorData(bias), eps, momentum,
train, leakiness);
}
}
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
THTensor *input_features, THTensor *output_features, THTensor *saveMean,
THTensor *saveInvStd, THTensor *runningMean, THTensor *runningVar,
THTensor *weight, THTensor *bias, real eps, real momentum, bool train,
real leakiness) {
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(weight), THOptionalTensorData(bias), eps, momentum,
train, leakiness);
}
}
extern "C" void scn_R_(BatchNormalization_backward)(
THTensor *input_features, THTensor *d_input_features,
THTensor *output_features, THTensor *d_output_features, THTensor *saveMean,
THTensor *saveInvStd, THTensor *runningMean, THTensor *runningVar,
THTensor *weight, THTensor *bias, THTensor *d_weight, THTensor *d_bias,
real leakiness) {
THTensor_(resizeAs)(d_input_features, input_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_BackwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(d_input_features),
THTensor_(data)(output_features), THTensor_(data)(d_output_features),
nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(weight), THOptionalTensorData(bias),
THOptionalTensorData(d_weight), THOptionalTensorData(d_bias),
leakiness);
}
}
#endif
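// --- Illustrative sketch (not part of the original file) ---
// The training-mode statistics BatchNormalization_ForwardPass needs: per-plane
// mean and inverse standard deviation over the nActive rows, after which
// out = leakyReLU(weight * (x - mean) * invStd + bias). Assumes <cmath> and
// biased variance (divide by nActive), a common BN convention.
template <typename T>
void BatchNorm_stats_sketch(const T *iF, uInt nActive, uInt nPlanes,
                            uInt stride, T eps, T *mean, T *invStd) {
  for (uInt p = 0; p < nPlanes; p++)
    mean[p] = invStd[p] = 0;
  for (uInt r = 0; r < nActive; r++)
    for (uInt p = 0; p < nPlanes; p++)
      mean[p] += iF[r * stride + p];
  for (uInt p = 0; p < nPlanes; p++)
    mean[p] /= nActive;
  for (uInt r = 0; r < nActive; r++)
    for (uInt p = 0; p < nPlanes; p++) {
      T d = iF[r * stride + p] - mean[p];
      invStd[p] += d * d; // accumulate squared deviations
    }
  for (uInt p = 0; p < nPlanes; p++)
    invStd[p] = T(1) / std::sqrt(invStd[p] / nActive + eps);
}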
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/BatchwiseMultiplicativeDropout.cpp"
#else
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
THTensor *input_features, THTensor *output_features, THTensor *noise,
float alpha) {
if (input_features != output_features)
THTensor_(resizeAs)(output_features, input_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto nz = THTensor_(data)(noise);
for (uInt row = 0; row < nActive; row++)
for (uInt plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
plane++, o++, i++)
oF[o] = (iF[i] > 0) ? iF[i] * nz[plane] : iF[i] * nz[plane] * alpha;
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *noise, float alpha) {
if (d_input_features != d_output_features)
THTensor_(resizeAs)(d_input_features, d_output_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto nz = THTensor_(data)(noise);
for (uInt row = 0; row < nActive; row++)
for (uInt plane = 0, o = row * nPlanes, i = row * nPlanes; plane < nPlanes;
plane++, o++, i++)
diF[i] = (iF[i] > 0) ? doF[o] * nz[plane] : doF[o] * nz[plane] * alpha;
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/Convolution.cpp"
#else
#include "Convolution.h"
extern "C" double scn_DR_(Convolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THTensor *input_features,
THTensor *output_features, THTensor *weight, THTensor *bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias)
THTensor_(zero)(output_features);
double flops = 0;
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
}
return flops;
}
extern "C" void scn_DR_(Convolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *d_output_features, THTensor *weight,
THTensor *d_weight, THTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
}
}
extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THTensor *input_features, THTensor *output_features, THTensor *weight,
THTensor *bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias)
THTensor_(zero)(output_features);
double flops = 0;
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
}
return flops;
}
extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *weight, THTensor *d_weight,
THTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
}
}
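// Note: the submanifold variants above deliberately reuse the input grid as
// the output grid — getNActive(inputSize) sizes the output — so the set of
// active sites, and hence the sparsity pattern, is preserved through the
// layer; only already-active sites receive output.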
extern "C" double scn_DR_(FullConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THTensor *input_features, THTensor *output_features, THTensor *weight,
THTensor *bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias)
THTensor_(zero)(output_features);
double flops = 0;
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
}
return flops;
}
extern "C" void scn_DR_(FullConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *weight, THTensor *d_weight,
THTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
}
}
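// Note: FullConvolution is the one layer built from two metadata objects —
// _mIn supplies the input grids and _mOut receives the output grids created
// by getFullConvolutionRuleBook, so the output sparsity pattern can grow
// beyond the input's active set.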
extern "C" double scn_DR_(RandomizedStrideConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THTensor *input_features,
THTensor *output_features, THTensor *weight, THTensor *bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
      _m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                     filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias)
THTensor_(zero)(output_features);
double flops = 0;
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
}
return flops;
}
extern "C" void scn_DR_(RandomizedStrideConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *d_output_features, THTensor *weight,
THTensor *d_weight, THTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
      _m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                     filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
}
}
#endif
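// --- Illustrative sketch (not part of the original file) ---
// The flops bookkeeping repeated in each updateOutput above, factored out:
// each rule r holds nHot = r.size()/2 (input,output) pairs and its GEMM costs
// nHot*ip*op multiply-accumulates. E.g. a 3x3 filter with ~1000 active sites
// per offset and 16->32 planes gives about 9 * 1000 * 16 * 32 = 4,608,000
// MACs per forward pass (an upper bound; boundary offsets have fewer pairs).
inline double rulebook_flops_sketch(const RuleBook &rules, uInt ip, uInt op) {
  double flops = 0;
  for (auto const &r : rules)
    flops += (double)(r.size() / 2) * ip * op;
  return flops;
}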
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_CONVOLUTION_H
#define CPU_CONVOLUTION_H
#include "../SparseConvNet.h"
#include <cstring>
// buffer must have size >= nHot * (nIn+nOut)
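// (Naming note: input_nPlanes is the number of live planes per site, while
// input_nPLANES is the row stride of the feature tensor — the two differ when
// nFeaturesToDrop skips leading planes.)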
template <typename T>
void Convolution_ForwardPass(
T *input_features, uInt input_nPlanes, uInt input_nPLANES,
T *output_features, uInt output_nPlanes, uInt output_nPLANES, T *weight,
T *bias, RuleBook &rules, uInt output_nActive,
void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
if (bias != nullptr) // Set bias
for (uInt row = 0; row < output_nActive; row++)
for (uInt column = 0; column < output_nPlanes; column++)
output_features[row * output_nPLANES + column] = bias[column];
std::vector<T> input_buffer, output_buffer;
for (auto &r : rules) {
uInt nHot = r.size() / 2;
input_buffer.resize(nHot * input_nPlanes);
output_buffer.resize(nHot * output_nPlanes);
for (uInt row = 0; row < nHot; row++) {
std::memcpy(&input_buffer[row * input_nPlanes],
input_features + r[2 * row] * input_nPLANES,
sizeof(T) * input_nPlanes);
}
// Do GEMM (note: gemm assumes column-major matrices)
// input_buffer is l*m (row-major)
// weight is m*r (row-major)
// output_buffer is l*r (row-major)
// buffer * weights -> output_buffers
(*gemm)('n', 'n',
output_nPlanes, // r
nHot, // l
input_nPlanes, // m
1, // alpha
weight, output_nPlanes, // r
&input_buffer[0], input_nPlanes, // m
0, // beta
&output_buffer[0], output_nPlanes // r
);
weight += input_nPlanes * output_nPlanes;
for (uInt row = 0; row < nHot; row++) {
T *b = &output_buffer[row * output_nPlanes];
T *o = &output_features[r[2 * row + 1] * output_nPLANES];
for (uInt k = 0; k < output_nPlanes; k++)
o[k] += b[k];
}
}
}
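// Note on the GEMM trick above: a row-major matrix reinterpreted as
// column-major is its transpose, so to get C = A.B with row-major A (l x m),
// B (m x r), C (l x r), one computes C^T = B^T . A^T in column-major:
//   gemm('n', 'n', r /*rows of C^T*/, l /*cols of C^T*/, m, 1,
//        B, r, A, m, 0, C, r);
// which is exactly the argument order used in Convolution_ForwardPass with
// A = input_buffer, B = weight, C = output_buffer.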
template <typename T>
void Convolution_BackwardPass(
T *input_features, T *d_input_features, uInt input_nPlanes,
uInt input_nPLANES, T *d_output_features, uInt output_nPlanes,
uInt output_nPLANES, T *weight, T *d_weight, T *d_bias, RuleBook &rules,
uInt output_nActive,
void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
if (d_bias)
for (uInt row = 0; row < output_nActive; row++)
for (uInt i = 0; i < output_nPlanes; i++)
d_bias[i] += d_output_features[row * output_nPLANES + i];
std::vector<T> input_buffer, output_buffer;
for (auto &r : rules) {
uInt nHot = r.size() / 2;
input_buffer.resize(nHot * input_nPlanes);
output_buffer.resize(nHot * output_nPlanes);
for (uInt row = 0; row < nHot; row++)
std::memcpy(&output_buffer[row * output_nPlanes],
&d_output_features[r[2 * row + 1] * output_nPLANES],
sizeof(T) * output_nPlanes);
// Do GEMM (note: gemm assumes column-major matrices)
// output_buffer is l*m (row-major)
// weights is r*m (row-major)
// input_buffer is l*r (row-major)
// output_buffer * T(weight) -> input_buffer
(*gemm)('t', 'n',
input_nPlanes, // r
nHot, // l
output_nPlanes, // m
1, // alpha
weight, output_nPlanes, // m
&output_buffer[0], output_nPlanes, // m
0, // beta
&input_buffer[0], input_nPlanes // r
);
weight += input_nPlanes * output_nPlanes;
for (uInt row = 0; row < nHot; row++) {
T *b = &input_buffer[row * input_nPlanes];
T *i = &d_input_features[r[2 * row] * input_nPLANES];
for (uInt k = 0; k < input_nPlanes; k++)
i[k] += b[k];
}
for (uInt row = 0; row < nHot; row++)
std::memcpy(&input_buffer[row * input_nPlanes],
input_features + r[2 * row] * input_nPLANES,
sizeof(T) * input_nPlanes);
// Do GEMM (note: gemm assumes column-major matrices)
// input_buffer is m*l (row-major)
// output_buffer is m*r (row-major)
// d_weights is l*r (row-major)
// T(input_buffer) * output_buffer -> d_weight
(*gemm)('n', 't',
output_nPlanes, // r
input_nPlanes, // l
nHot, // m
1, // alpha
&output_buffer[0], output_nPlanes, // r
&input_buffer[0], input_nPlanes, // l
1, // beta
d_weight, output_nPlanes // r
);
d_weight += input_nPlanes * output_nPlanes;
}
}
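// Note: the backward pass above is two GEMMs per rule sharing the gathered
// buffers — d_input_buffer = d_output_buffer . W^T (the 't','n' call) and
// d_W += input_buffer^T . d_output_buffer (the 'n','t' call with beta = 1,
// so weight gradients accumulate across rules and across calls).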
// template <typename T>
// void Convolution_ForwardPass(
// T *input_features, uInt input_nPlanes, uInt input_nPLANES,
// T *output_features, uInt output_nPlanes, uInt output_nPLANES, T *weight,
// T *bias, RuleBook &rules, uInt output_nActive,
// void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
// T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
// if (bias != nullptr) // Set bias
// for (uInt row = 0; row < output_nActive; row++)
// for (uInt column = 0; column < output_nPlanes; column++)
// output_features[row * output_nPLANES + column] = bias[column];
// for (auto &r : rules) {
// uInt nHot = r.size() / 2;
// for (uInt row = 0; row < nHot; row++) {
// T *inp = &input_features[r[2 * row] * input_nPLANES];
// T *out = &output_features[r[2 * row + 1] * output_nPLANES];
// for (uInt i = 0; i < input_nPlanes; i++)
// for (uInt j = 0; j < output_nPlanes; j++)
// out[j] += inp[i] * weight[i * output_nPlanes + j];
// }
// weight += input_nPlanes * output_nPlanes;
// }
// }
#endif /* CPU_CONVOLUTION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/Deconvolution.cpp"
#else
#include "Deconvolution.h"
extern "C" double scn_DR_(Deconvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THTensor *input_features,
THTensor *output_features, THTensor *weight, THTensor *bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias)
THTensor_(zero)(output_features);
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Deconvolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
double flops = 0;
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
return flops;
}
extern "C" void scn_DR_(Deconvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *d_output_features, THTensor *weight,
THTensor *d_weight, THTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Deconvolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
}
#endif
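// Note: Deconvolution builds its rulebook with inputSize and outputSize
// swapped (getRuleBook(outputSize, inputSize, ...) above), so the same
// gather/GEMM/scatter machinery runs the convolution "in reverse": the
// output is the larger grid, and Deconvolution_ForwardPass scatters into it.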