Commit 3ff930b7 authored by Ed Ng's avatar Ed Ng Committed by GitHub
Browse files

Merge branch 'master' into batchwise_dropout

parents df96d0c0 c8e8db32
......@@ -35,6 +35,7 @@ if torch.cuda.is_available():
'sparseconvnet/SCN/header_cpu.h',
'sparseconvnet/SCN/header_gpu.h'],
sources=[],
include_dirs=[os.path.expandvars('$CUDA_HOME') + '/include'],
extra_objects=[
this_dir +
'/sparseconvnet/SCN/init.cu.o'],
......
......@@ -14,18 +14,20 @@ extern "C" void scn_R_(BatchNormalization_updateOutput)(
THTensor *saveInvStd, THTensor *runningMean, THTensor *runningVar,
THTensor *weight, THTensor *bias, real eps, real momentum, bool train,
real leakiness) {
THTensor_(resizeAs)(output_features, input_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive, THTensor_(data)(saveMean),
THTensor_(data)(saveInvStd), THTensor_(data)(runningMean),
THTensor_(data)(runningVar), THOptionalTensorData(weight),
THOptionalTensorData(bias), eps, momentum, train, leakiness);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(weight), THOptionalTensorData(bias), eps, momentum,
train, leakiness);
}
}
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
......@@ -34,17 +36,20 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
THTensor *weight, THTensor *bias, real eps, real momentum, bool train,
real leakiness) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive, THTensor_(data)(saveMean),
THTensor_(data)(saveInvStd), THTensor_(data)(runningMean),
THTensor_(data)(runningVar), THOptionalTensorData(weight),
THOptionalTensorData(bias), eps, momentum, train, leakiness);
BatchNormalization_ForwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(output_features),
nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(weight), THOptionalTensorData(bias), eps, momentum,
train, leakiness);
}
}
extern "C" void scn_R_(BatchNormalization_backward)(
......@@ -55,17 +60,20 @@ extern "C" void scn_R_(BatchNormalization_backward)(
real leakiness) {
THTensor_(resizeAs)(d_input_features, input_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_BackwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(d_input_features),
THTensor_(data)(output_features), THTensor_(data)(d_output_features),
nPlanes, input_stride, output_stride, nActive, THTensor_(data)(saveMean),
THTensor_(data)(saveInvStd), THTensor_(data)(runningMean),
THTensor_(data)(runningVar), THOptionalTensorData(weight),
THOptionalTensorData(bias), THOptionalTensorData(d_weight),
THOptionalTensorData(d_bias), leakiness);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BatchNormalization_BackwardPass<real>(
THTensor_(data)(input_features), THTensor_(data)(d_input_features),
THTensor_(data)(output_features), THTensor_(data)(d_output_features),
nPlanes, input_stride, output_stride, nActive,
THTensor_(data)(saveMean), THTensor_(data)(saveInvStd),
THTensor_(data)(runningMean), THTensor_(data)(runningVar),
THOptionalTensorData(weight), THOptionalTensorData(bias),
THOptionalTensorData(d_weight), THOptionalTensorData(d_bias),
leakiness);
}
}
#endif
......@@ -23,17 +23,19 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
if (not bias)
THTensor_(zero)(output_features);
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
double flops = 0;
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
}
return flops;
}
......@@ -51,17 +53,19 @@ extern "C" void scn_DR_(Convolution_backward)(
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
}
}
extern "C" double scn_DR_(ValidConvolution_updateOutput)(
......@@ -71,24 +75,25 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0];
uInt nActive = _m.getNActive(inputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias)
THTensor_(zero)(output_features);
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
double flops = 0;
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THTensor_(data)(weight);
auto b = THOptionalTensorData(bias);
Convolution_ForwardPass(iF, ip, ip, oF, op, op, w, b, _rules, nActive,
THBlas_(gemm));
for (auto &r : _rules)
flops += r.size() / 2 * ip * op;
}
return flops;
}
......@@ -100,21 +105,22 @@ extern "C" void scn_DR_(ValidConvolution_backward)(
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0];
uInt nActive = _m.getNActive(inputSize);
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
if (nActive) {
auto iF = THTensor_(data)(input_features);
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THTensor_(data)(weight);
auto dw = THTensor_(data)(d_weight);
auto db = THOptionalTensorData(d_bias);
Convolution_BackwardPass(iF, diF, ip, ip, doF, op, op, w, dw, db, _rules,
nActive, THBlas_(gemm));
}
}
#endif
......@@ -9,34 +9,32 @@
#else
#include "SparseToDense.h"
extern "C" void scn_DR_(SparseToDense_updateOutput)(THLongTensor *inputSize,
void **m,
THTensor *input_features,
THTensor *output_features,
void *rulesBuffer) {
extern "C" void scn_DR_(SparseToDense_updateOutput)(
THLongTensor *inputSize, void **m, THTensor *input_features,
THTensor *output_features, void *rulesBuffer, long nPlanes) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) {
{
long sz[Dimension + 2];
sz[0] = _m.inputSGs->size();
sz[1] = input_features->size[1];
for (int i = 0; i < Dimension; i++) {
auto x = THLongTensor_data(inputSize)[i];
sz[i + 2] = x;
}
sz[0] = _m.grids.begin()->second.size();
sz[1] = nPlanes; // input_features->size[1];
std::memcpy(sz + 2, THLongTensor_data(inputSize), sizeof(long) * Dimension);
THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
THTensor_(zero)(output_features);
}
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = input_features->size[1];
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0],
nHot);
oF++;
if (input_features->nDimension == 2) {
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
uInt nPlanes = input_features->size[1];
auto iF = THTensor_(data)(input_features);
auto oF = THTensor_(data)(output_features);
long spatialVolume = THLongTensor_prodall(inputSize);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0],
nHot);
oF += nPlanes * spatialVolume;
}
}
}
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
......@@ -44,21 +42,22 @@ extern "C" void scn_DR_(SparseToDense_updateGradInput)(
THTensor *d_input_features, THTensor *d_output_features,
void *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features);
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = d_input_features->size[1];
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
if (input_features->nDimension == 2) {
long spatialVolume = THLongTensor_prodall(inputSize);
uInt nPlanes = d_input_features->size[1];
auto diF = THTensor_(data)(d_input_features);
auto doF = THTensor_(data)(d_output_features);
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0],
nHot);
doF++;
for (auto &r : _rules) {
uInt nHot = r.size() / 2;
SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0],
nHot);
doF += nPlanes * spatialVolume;
}
}
}
#endif
......@@ -10,27 +10,26 @@
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
uInt nPlanes, uInt spatialVolume,
uInt* rules, int nHot) {
for (uInt outSite = 0; outSite < nHot; outSite++) {
T *i = &input_features[rules[2 * outSite] * nPlanes];
uInt sample = rules[2 * outSite + 1];
for (uInt plane = 0; plane < nPlanes; plane++)
output_features[(sample*nPlanes+plane)*spatialVolume]=i[plane];
}
uInt nPlanes, uInt spatialVolume, uInt *rules,
int nHot) {
for (uInt outSite = 0; outSite < nHot; outSite++) {
T *i = input_features + rules[2 * outSite] * nPlanes;
T *o = output_features + rules[2 * outSite + 1];
for (uInt plane = 0; plane < nPlanes; plane++)
o[plane * spatialVolume] = i[plane];
}
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt spatialVolume,
uInt* rules, int nHot) {
uInt nPlanes, uInt spatialVolume, uInt *rules,
int nHot) {
for (uInt outSite = 0; outSite < nHot; outSite++) {
T *di = &d_input_features[rules[2 * outSite] * nPlanes];
uInt sample = rules[2 * outSite + 1];
for (uInt plane = 0; plane < nPlanes; plane++)
di[plane]=d_output_features[(sample*nPlanes+plane)*spatialVolume];
}
}
for (uInt outSite = 0; outSite < nHot; outSite++) {
T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
auto d_o = d_output_features + rules[2 * outSite + 1];
for (uInt plane = 0; plane < nPlanes; plane++)
d_i[plane] = d_o[plane * spatialVolume];
}
}
#endif /* CPU_SPARSETODENSE_H */
......@@ -10,7 +10,6 @@
#include "AffineReluTrivialConvolution.h"
#include <algorithm>
#include <iostream>
extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
THCTensor *input_features, THCTensor *output_features,
......
......@@ -155,6 +155,27 @@ __global__ void dAffineReluTrivialConvolution_forwardB(
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_forwardA<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
(inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB<T, K, V> << < \
dim3(1, output_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
......@@ -162,92 +183,25 @@ void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
uInt input_stride,
uInt output_nPlanes,
uInt output_stride, uInt nActive) {
{
const uInt K = 64;
const uInt V = 16;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 32;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 16;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
{
const uInt K = 8;
const uInt V = 2;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_forwardA<
T, K, V><<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, outFeatures, affineWeight, affineBias, convWeight,
input_nPlanes, input_stride, output_nPlanes, output_stride, o);
if (nActive > o)
dAffineReluTrivialConvolution_forwardB<
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, outFeatures + o * output_stride,
affineWeight, affineBias, convWeight, input_nPlanes, input_stride,
output_nPlanes, output_stride, nActive - o);
return;
}
}
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dAffineReluTrivialConvolution_forward<double>(
double *inFeatures, double *outFeatures, double *affineWeight,
double *affineBias, double *convWeight, uInt input_nPlanes,
uInt input_stride, uInt output_nPlanes, uInt output_stride, uInt nActive) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
......@@ -449,84 +403,41 @@ __global__ void dAffineReluTrivialConvolution_backward_dW_B(
atomicAdd(&dAffineBias[tx], dAB);
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>> \
(inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, \
THCState_getCurrentStream(state)>>> \
(inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
additiveGrad); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
uInt output_stride, uInt nActive, bool additiveGrad) {
{
const uInt K = 32;
const uInt V = 8;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_backward_dW_A<
T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, o, additiveGrad);
if (nActive > o)
dAffineReluTrivialConvolution_backward_dW_B<
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, dInFeatures + o * input_stride,
dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, nActive - o,
additiveGrad);
return;
}
}
{
const uInt K = 16;
const uInt V = 4;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_backward_dW_A<
T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, o, additiveGrad);
if (nActive > o)
dAffineReluTrivialConvolution_backward_dW_B<
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, dInFeatures + o * input_stride,
dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, nActive - o,
additiveGrad);
return;
}
}
{
const uInt K = 8;
const uInt V = 2;
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
uInt o = (nActive / K) * K;
if (o > 0)
dAffineReluTrivialConvolution_backward_dW_A<
T, K, V><<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
inFeatures, dInFeatures, dOutFeatures, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, o, additiveGrad);
if (nActive > o)
dAffineReluTrivialConvolution_backward_dW_B<
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
THCState_getCurrentStream(state)>>>(
inFeatures + o * input_stride, dInFeatures + o * input_stride,
dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes,
input_stride, output_nPlanes, output_stride, nActive - o,
additiveGrad);
return;
}
}
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
}
#undef FOO
#endif
......@@ -30,13 +30,14 @@ extern "C" void scn_R_(BatchNormalization_updateOutput)(
real leakiness) {
THCTensor_(resizeAs)(state, output_features, input_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
......@@ -44,14 +45,14 @@ extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
real leakiness) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
#undef BN_F_MACRO
......@@ -81,12 +82,13 @@ extern "C" void scn_R_(BatchNormalization_backward)(
THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_B_MACRO(16)
else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_B_MACRO(16)
else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
}
#endif
......@@ -25,28 +25,30 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
if (not bias)
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
return flops;
}
......@@ -63,23 +65,25 @@ extern "C" void scn_DR_(Convolution_backward)(
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
op, op, THCState_getCurrentStream(state));
, w += c; dw += c;)
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
......@@ -89,33 +93,35 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0];
uInt nActive = _m.getNActive(inputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
return flops;
}
......@@ -126,27 +132,29 @@ extern "C" void scn_DR_(ValidConvolution_backward)(
THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
uInt nActive = input_features->size[0];
uInt nActive = _m.getNActive(inputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip, ip,
op, op, THCState_getCurrentStream(state));
, w += c; dw += c;)
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
......
......@@ -184,7 +184,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
}
#define FOO(K, V) \
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
......@@ -208,10 +208,21 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
uInt nHot, uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(64, 16)
FOO(32, 8)
FOO(16, 4)
FOO(8, 2)
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
// Explicit double-precision specialization of the convolution forward
// dispatcher. Each FOO(T, K, V) expansion (macro defined above) launches a
// tiled kernel guarded by `input_nPlanes % K == 0 and output_nPlanes % K == 0`,
// trying tile sizes from largest to smallest.
// NOTE(review): unlike the generic template, the K=64 tile is omitted here —
// presumably because double-precision tiles need twice the shared-memory /
// register budget of float tiles; confirm against the kernel's resource usage.
// If no tile size divides both plane counts (i.e. nPlanes is not a multiple
// of 8), control falls through to assert(false).
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, uInt *rules, uInt nHot,
uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
......@@ -378,7 +389,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
}
#define FOO(K, V) \
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
......@@ -404,9 +415,9 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(32, 8)
FOO(16, 4)
FOO(8, 2)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
#undef FOO
......
......@@ -153,7 +153,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
}
#define FOO(K, V) \
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
......@@ -177,10 +177,21 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
uInt nHot, uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(64, 16)
FOO(32, 8)
FOO(16, 4)
FOO(8, 2)
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
// Explicit double-precision specialization of the deconvolution forward
// dispatcher. Each FOO(T, K, V) expansion (macro defined above) launches a
// tiled kernel guarded by `input_nPlanes % K == 0 and output_nPlanes % K == 0`,
// trying tile sizes from largest to smallest.
// NOTE(review): the K=64 tile present in the generic template is omitted for
// doubles — presumably to stay within shared-memory / register limits at
// sizeof(double); verify against the kernel's launch configuration.
// Falls through to assert(false) when neither 32, 16 nor 8 divides both
// input_nPlanes and output_nPlanes.
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, uInt *rules, uInt nHot,
uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
......@@ -345,7 +356,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
}
}
#define FOO(K, V) \
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
......@@ -371,9 +382,9 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
uInt input_nPlanes, uInt input_stride,
uInt output_nPlanes, uInt output_stride,
cudaStream_t stream) {
FOO(32, 8)
FOO(16, 4)
FOO(8, 2)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
#undef FOO
......
......@@ -9,50 +9,54 @@
#else
#include "SparseToDense.h"
extern "C" void scn_DR_(SparseToDense_updateOutput)(THLongTensor *inputSize,
void **m,
THCTensor *input_features,
THCTensor *output_features,
THCITensor *rulesBuffer) {
extern "C" void scn_DR_(SparseToDense_updateOutput)(
THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *output_features, THCITensor *rulesBuffer, long nPlanes) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
long spatialVolume = 1;
{
long sz[Dimension + 2];
sz[0] = _m.inputSGs->size();
sz[1] = input_features->size[1];
sz[0] = _m.grids.begin()->second.size();
sz[1] = nPlanes; // input_features->size[1];
for (int i = 0; i < Dimension; i++) {
auto x = THLongTensor_data(inputSize)[i];
sz[i + 2] = x;
spatialVolume *= x;
}
THCTensor_(resizeNd)(state, output_features, Dimension + 2, sz, NULL);
THCTensor_(zero)(state, output_features);
}
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = input_features->size[1];
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
RULEBOOKITERATOR(
SparseToDense_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
nPlanes, spatialVolume, rbB, nHotB);
, oF++;) // todo check ++ or +=spatialVolume????zzz
if (input_features->nDimension == 2) {
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
uInt nPlanes = input_features->size[1];
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
RULEBOOKITERATOR(
SparseToDense_ForwardPass<real>(THCState_getCurrentStream(state), iF,
oF, nPlanes, spatialVolume, rbB, nHotB);
, oF += nPlanes * spatialVolume;)
}
}
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
THLongTensor *inputSize, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
auto spatialVolume = _rules.size();
uInt nPlanes = d_input_features->size[1];
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
RULEBOOKITERATOR(
SparseToDense_BackwardPass<real>(THCState_getCurrentStream(state), diF,
doF, nPlanes, spatialVolume, rbB, nHotB);
, doF++;)
if (input_features->nDimension == 2) {
auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
long spatialVolume = THLongTensor_prodall(inputSize);
uInt nPlanes = d_input_features->size[1];
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
RULEBOOKITERATOR(SparseToDense_BackwardPass<real>(
THCState_getCurrentStream(state), diF, doF, nPlanes,
spatialVolume, rbB, nHotB);
, doF += nPlanes * spatialVolume;)
}
}
#endif
......@@ -12,7 +12,8 @@
// NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
uInt nPlanes, uInt spatialVolume, uInt *rules, uInt nHot) {
uInt nPlanes, uInt spatialVolume, uInt *rules,
uInt nHot) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
......@@ -22,10 +23,10 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *i = &input_features[r[2 * threadIdx.y] * nPlanes];
T *o = &output_features[r[2*threadIdx.y+1]*spatialVolume*nPlanes];
T *i = input_features + r[2 * threadIdx.y] * nPlanes;
T *o = output_features + r[2 * threadIdx.y + 1];
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
o[plane*spatialVolume]=i[plane];
o[plane * spatialVolume] = i[plane];
}
__syncthreads();
}
......@@ -33,16 +34,16 @@ __global__ void SparseToDense_fp(T *input_features, T *output_features,
template <typename T>
void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
T *output_features, uInt nPlanes,
uInt spatialVolume,
uInt *rules, uInt nHot) {
SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
input_features, output_features, nPlanes, spatialVolume, rules, nHot);
T *output_features, uInt nPlanes,
uInt spatialVolume, uInt *rules, uInt nHot) {
SparseToDense_fp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
(input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt spatialVolume, uInt *rules, uInt nHot) {
uInt nPlanes, uInt spatialVolume, uInt *rules,
uInt nHot) {
__shared__ uInt r[NTY * 2];
for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
......@@ -52,10 +53,10 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *i = &d_input_features[r[2 * threadIdx.y] * nPlanes];
T *o = &d_output_features[r[2*threadIdx.y+1]*spatialVolume*nPlanes];
T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
T *d_o = d_output_features + r[2 * threadIdx.y + 1];
for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
i[plane]=o[plane*spatialVolume];
d_i[plane] = d_o[plane * spatialVolume];
}
__syncthreads();
}
......@@ -63,10 +64,10 @@ __global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
template <typename T>
void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
T *d_output_features, uInt nPlanes,
uInt spatialVolume,
uInt *rules, uInt nHot) {
SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
d_input_features, d_output_features, nPlanes, spatialVolume, rules, nHot);
T *d_output_features, uInt nPlanes,
uInt spatialVolume, uInt *rules, uInt nHot) {
SparseToDense_bp<T, 32, 32> << <32, dim3(32, 32), 0, stream>>>
(d_input_features, d_output_features, nPlanes, spatialVolume, rules,
nHot);
}
#endif /* GPU_SPARSETODENSE_H */
......@@ -27,4 +27,22 @@
#undef TH_REAL_IS_FLOAT
#undef THBLAS_GEMM
// double
// #define real double
// #define accreal double
// #define Real Double
// #define CReal CudaDouble
// #define TH_REAL_IS_DOUBLE
// #define THBLAS_GEMM THCudaBlas_Dgemm
// #line 1 TH_GENERIC_FILE
// #include TH_GENERIC_FILE
// #undef accreal
// #undef real
// #undef Real
// #undef CReal
// #undef TH_REAL_IS_DOUBLE
// #undef THBLAS_GEMM
#undef TH_GENERIC_FILE
......@@ -103,20 +103,26 @@ uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
return output_nActive;
}
// for each site in filterVolume, list of (inputFeatureNumber,batchIdx) pairs
// for each active site, list of (inputFeatureNumber,batchIdx, spatialOffset)
// triples
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs(
SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
uInt batchSize = input_SGs.size();
SparseGrids<dimension> output_SGs(batchSize);
std::vector<long> ones(dimension, 1);
rules.clear();
for (uInt i = 0; i < batchSize; i++) {
auto &iSG = input_SGs[i];
auto &oSG = output_SGs[i];
oSG.ctr = i; // batchIdx
Convolution_InputSgToRulesAndOutputSg<dimension>(
iSG, oSG, rules, spatialSize, &ones[0], spatialSize, &ones[0]);
rules.resize(batchSize);
Point<dimension> lb, ub;
for (int i = 0; i < dimension; ++i) {
lb[i] = 0;
ub[i] = spatialSize[i] - 1;
}
auto region = RectangularRegion<dimension>(lb, ub);
for (uInt batchIdx = 0; batchIdx < batchSize; batchIdx++) {
auto &iSG = input_SGs[batchIdx];
for (auto const &inIter : iSG.mp) {
rules[batchIdx].push_back(inIter.second + iSG.ctr);
rules[batchIdx].push_back(region.offset(inIter.first));
}
}
}
......@@ -124,33 +130,21 @@ template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
uInt batchSize = input_SGs.size();
SparseGrids<dimension> output_SGs(batchSize);
std::vector<long> ones(dimension, 1);
rules.clear();
rules.resize(volume<dimension>(spatialSize));
std::vector<RuleBook> rbs(batchSize);
{
uInt i;
#pragma omp parallel for private(i)
for (i = 0; i < batchSize; i++) {
output_SGs[i].ctr = i; // batchIdx
Convolution_InputSgToRulesAndOutputSg<dimension>(
input_SGs[i], output_SGs[i], rbs[i], spatialSize, &ones[0],
spatialSize, &ones[0]);
}
rules.resize(batchSize);
Point<dimension> lb, ub;
for (int i = 0; i < dimension; ++i) {
lb[i] = 0;
ub[i] = spatialSize[i] - 1;
}
{
uInt i;
#pragma omp parallel for private(i)
for (i = 0; i < rules.size(); i++) {
auto &R = rules[i];
for (uInt j = 0; j < batchSize; j++) {
auto &r = rbs[j][i];
for (uInt k = 0; k < r.size();) {
R.push_back(r[k++]);
R.push_back(r[k++]);
}
}
auto region = RectangularRegion<dimension>(lb, ub);
uInt batchIdx;
#pragma omp parallel for private(batchIdx)
for (batchIdx = 0; batchIdx < batchSize; batchIdx++) {
auto &iSG = input_SGs[batchIdx];
for (auto const &inIter : iSG.mp) {
rules[batchIdx].push_back(inIter.second + iSG.ctr);
rules[batchIdx].push_back(region.offset(inIter.first));
}
}
}
......
......@@ -125,16 +125,15 @@ extern "C" void scn_D_(getSpatialLocations)(void **m, THLongTensor *spatialSize,
}
extern "C" void
scn_D_(createMetadataForDenseToSparse)(void **m, THLongTensor *spatialSize_,
THLongTensor *pad_, THLongTensor *nz_,
long batchSize) {
THLongTensor *nz_, long batchSize) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.clear();
_m.setInputSpatialSize(spatialSize_);
_m.inputSGs->resize(batchSize);
auto &nActive = *_m.inputNActive;
nActive = nz_->size[0];
auto nz = THLongTensor_data(nz_);
auto pad = THLongTensor_data(pad_);
auto spatialSize = THLongTensor_data(spatialSize_);
std::vector<uInt> br(batchSize + 1);
......@@ -157,8 +156,7 @@ scn_D_(createMetadataForDenseToSparse)(void **m, THLongTensor *spatialSize_,
for (uInt i = br[b]; i < br[b + 1]; i++) {
Point<Dimension> x;
for (uInt j = 0; j < Dimension; j++) {
x[j] = nz[i * (Dimension + 1) + j + 1] +
pad[b * Dimension + j]; // 0-indexed
x[j] = nz[i * (Dimension + 1) + j + 1]; // 0-indexed
}
sg.mp[x] = i;
}
......@@ -281,6 +279,7 @@ extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
p2[i] = p3[i] = inS[i] = outS[i];
}
}
// C-linkage destructor entry point for the Python/FFI layer: destroys the
// Metadata<Dimension> object stored behind *m.
// NOTE(review): SCN_DELETE is defined elsewhere — presumably it deletes the
// object and resets the stored pointer so a later SCN_INITIALIZE can
// re-create it; confirm against the macro definition.
extern "C" void scn_D_(freeMetadata)(void **m) {
SCN_DELETE(Metadata<Dimension>, m)
}
......
......@@ -11,7 +11,6 @@
#include "ActivePoolingRules.h"
#include "ConvolutionRules.h"
#include "ValidConvolutionRules.h"
#include <iostream>
#include <tuple>
#include <unordered_map>
......@@ -40,6 +39,18 @@ public:
uInt *inputNActive;
Metadata() {}
// Reset this Metadata object to its freshly-constructed state: empty all
// cached state (active-site counts, sparse grids, and every rule-book cache)
// and drop the convenience pointers.
void clear() {
nActive.clear();
grids.clear();
activePoolingRuleBooks.clear();
validRuleBooks.clear();
ruleBooks.clear();
sparseToDenseRuleBooks.clear();
// inputSGs points into `grids` (see setInputSpatialSize: &grids[...]), so
// after grids.clear() it would dangle — null it explicitly. inputSG and
// inputNActive are presumably non-owning views into the same caches;
// TODO(review): confirm their provenance where they are assigned.
inputSGs = nullptr;
inputSG = nullptr;
inputNActive = nullptr;
}
void setInputSpatialSize(THLongTensor *spatialSize) {
inputSpatialSize = LongTensorToPoint<dimension>(spatialSize);
inputSGs = &grids[inputSpatialSize];
......
......@@ -6,8 +6,6 @@
#ifndef VALIDCONVOLUTIONRULES_H
#define VALIDCONVOLUTIONRULES_H
#include<iostream>
// Full input region for an output point
template <uInt dimension>
......@@ -26,8 +24,8 @@ InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
// rules is used to carry out the "lowering" whilst carrying out the convolution
template <uInt dimension>
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid,
RuleBook &rules, long *size) {
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
long *size) {
uInt sd = volume<dimension>(size);
double countActiveInputs = 0;
for (auto const &outputIter : grid.mp) {
......@@ -48,8 +46,8 @@ double ValidConvolution_SgToRules(SparseGrid<dimension> &grid,
}
template <uInt dimension>
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs,
RuleBook &rules, long *size) {
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
long *size) {
uInt sd = volume<dimension>(size);
uInt countActiveInputs = 0;
rules.clear();
......@@ -61,7 +59,7 @@ uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs,
}
template <uInt dimension>
uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
RuleBook &rules, long *size) {
RuleBook &rules, long *size) {
std::vector<RuleBook> rbs(SGs.size());
std::vector<double> countActiveInputs(SGs.size());
rules.clear();
......
This diff is collapsed.
......@@ -120,7 +120,8 @@ void scn_gpu_float1MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float1SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer,
long nPlanes);
void scn_gpu_float1SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -197,7 +198,7 @@ void scn_gpu_float2MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float2SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float2SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -274,7 +275,7 @@ void scn_gpu_float3MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float3SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float3SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -351,7 +352,7 @@ void scn_gpu_float4MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float4SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float4SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -428,7 +429,7 @@ void scn_gpu_float5MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float5SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float5SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -505,7 +506,7 @@ void scn_gpu_float6MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float6SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float6SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -582,7 +583,7 @@ void scn_gpu_float7MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float7SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float7SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -659,7 +660,7 @@ void scn_gpu_float8MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float8SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float8SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -736,7 +737,7 @@ void scn_gpu_float9MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float9SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float9SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......@@ -813,7 +814,7 @@ void scn_gpu_float10MaxPooling_updateGradInput(
// SparseToDense
void scn_gpu_float10SparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer);
THCudaTensor *output_features, THCudaIntTensor *rulesBuffer, long nPlanes);
void scn_gpu_float10SparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THCudaTensor *input_features,
THCudaTensor *d_input_features, THCudaTensor *d_output_features,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment