Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_Deconvolution_H
#define CPU_Deconvolution_H
#include "../SparseConvNet.h"
#include <cstring>
// buffer must have size >= nHot * (nIn+nOut)
// Deconvolution forward pass (CPU).
// input_nPlanes / output_nPlanes: number of feature planes actually used.
// input_nPLANES / output_nPLANES: row strides of the feature matrices (the
// used planes may live inside wider rows, hence separate strides).
// rules: one rule vector per filter offset; each vector stores pairs
// (output row, input row) -- see the r[2*row] / r[2*row+1] indexing below.
// gemm follows the BLAS convention (column-major matrices).
template <typename T>
void Deconvolution_ForwardPass(
    T *input_features, uInt input_nPlanes, uInt input_nPLANES,
    T *output_features, uInt output_nPlanes, uInt output_nPLANES, T *weight,
    T *bias, RuleBook &rules, uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (bias != nullptr) // Set bias
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPLANES + column] = bias[column];
  std::vector<T> input_buffer, output_buffer;
  for (auto &r : rules) {
    uInt nHot = r.size() / 2;
    input_buffer.resize(nHot * input_nPlanes);
    output_buffer.resize(nHot * output_nPlanes);
    // Gather the active input rows into a contiguous buffer.
    for (uInt row = 0; row < nHot; row++)
      std::memcpy(&input_buffer[row * input_nPlanes],
                  input_features + r[2 * row + 1] * input_nPLANES,
                  sizeof(T) * input_nPlanes);
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_buffer is l*m (row-major)
    // weight is m*r (row-major)
    // output_buffer is l*r (row-major)
    // buffer * weights -> output_buffers
    (*gemm)('n', 'n',
            output_nPlanes, // r
            nHot,           // l
            input_nPlanes,  // m
            1,              // alpha
            weight, output_nPlanes,           // r
            &input_buffer[0], input_nPlanes,  // m
            0,                                // beta
            &output_buffer[0], output_nPlanes // r
            );
    // Advance to the weight matrix of the next filter offset.
    weight += input_nPlanes * output_nPlanes;
    // Scatter-add the GEMM result into the (strided) output feature rows.
    for (uInt row = 0; row < nHot; row++) {
      T *b = &output_buffer[row * output_nPlanes];
      T *o = &output_features[r[2 * row] * output_nPLANES];
      for (uInt k = 0; k < output_nPlanes; k++)
        o[k] += b[k];
    }
  }
}
// Deconvolution backward pass (CPU): accumulates d_input_features, d_weight
// and (optionally) d_bias.
// nPlanes = planes used, nPLANES = row stride, as in the forward pass.
// Rule pairs are (output row, input row).
template <typename T>
void Deconvolution_BackwardPass(
    T *input_features, T *d_input_features, uInt input_nPlanes,
    uInt input_nPLANES, T *d_output_features, uInt output_nPlanes,
    uInt output_nPLANES, T *weight, T *d_weight, T *d_bias, RuleBook &rules,
    uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (d_bias) // d_bias accumulates the column sums of d_output_features
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPLANES + i];
  std::vector<T> input_buffer, output_buffer;
  for (auto &r : rules) {
    uInt nHot = r.size() / 2;
    input_buffer.resize(nHot * input_nPlanes);
    output_buffer.resize(nHot * output_nPlanes);
    // Gather the active output-gradient rows into a contiguous buffer.
    for (uInt row = 0; row < nHot; row++)
      std::memcpy(&output_buffer[row * output_nPlanes],
                  &d_output_features[r[2 * row] * output_nPLANES],
                  sizeof(T) * output_nPlanes);
    // Do GEMM (note: gemm assumes column-major matrices)
    // output_buffer is l*m (row-major)
    // weights is r*m (row-major)
    // input_buffer is l*r (row-major)
    // output_buffer * T(weight) -> input_buffer
    (*gemm)('t', 'n',
            input_nPlanes,  // r
            nHot,           // l
            output_nPlanes, // m
            1,              // alpha
            weight, output_nPlanes,            // m
            &output_buffer[0], output_nPlanes, // m
            0,                                 // beta
            &input_buffer[0], input_nPlanes    // r
            );
    weight += input_nPlanes * output_nPlanes;
    // Scatter-add the input gradients back to their (strided) rows.
    for (uInt row = 0; row < nHot; row++) {
      T *b = &input_buffer[row * input_nPlanes];
      T *i = &d_input_features[r[2 * row + 1] * input_nPLANES];
      for (uInt k = 0; k < input_nPlanes; k++)
        i[k] += b[k];
    }
    // Re-use input_buffer: gather the forward input rows for the d_weight GEMM.
    for (uInt row = 0; row < nHot; row++)
      std::memcpy(&input_buffer[row * input_nPlanes],
                  input_features + r[2 * row + 1] * input_nPLANES,
                  sizeof(T) * input_nPlanes);
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_buffer is m*l (row-major)
    // output_buffer is m*r (row-major)
    // d_weights is l*r (row-major)
    // T(input_buffer) * output_buffer -> d_weight (accumulated, beta = 1)
    (*gemm)('n', 't',
            output_nPlanes, // r
            input_nPlanes,  // l
            nHot,           // m
            1,              // alpha
            &output_buffer[0], output_nPlanes, // r
            &input_buffer[0], input_nPlanes,   // l
            1,                                 // beta
            d_weight, output_nPlanes           // r
            );
    d_weight += input_nPlanes * output_nPlanes;
  }
}
#endif /* CPU_Deconvolution_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/IOLayers.cpp"
#else
#include "IOLayers.h"
// Build the input-layer rule book from (coordinates, features) and produce
// the initial sparse feature matrix (one row per active site).
// Metadata layout used here: rules[0][1] = maxActive (max input rows merged
// into one active site), rules[0][3] = nRows.
// mode == 0: one feature row per site already; just copy.
// Otherwise duplicate coordinates are merged; the `mode == 4` flag is
// forwarded as a boolean to InputLayer_ForwardPass (presumably "average
// instead of sum" -- confirm against InputLayer_ForwardPass).
extern "C" void scn_DR_(InputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THTensor *input_features, THTensor *output_features, long batchSize,
    long mode) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto nPlanes = input_features->size[1];
  auto &rules = _m.inputLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
  } else {
    // One output row per active spatial location.
    THTensor_(resize2d)(output_features, *_m.inputNActive, nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(input_features),
                                 THTensor_(data)(output_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// Input-layer backward pass: scatter d_output back onto the original input
// rows using the rule book recorded by updateOutput.
// rules[0][0] = mode, rules[0][1] = maxActive, rules[0][2] = number of
// original input rows (resize target), rules[0][3] = nRows.
extern "C" void scn_DR_(InputLayer_updateGradInput)(void **m,
                                                    THTensor *d_input_features,
                                                    THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.inputLayerRuleBook;
  auto nPlanes = d_output_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    // Forward was a plain copy, so the backward pass is too.
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
  } else {
    THTensor_(resize2d)(d_input_features, rules[0][2], nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(d_input_features),
                                  THTensor_(data)(d_output_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// Output layer forward pass: maps active-site features back to one row per
// original input point. Reuses the input layer's rule book, running
// InputLayer_BackwardPass in the "forward" direction (averaging disabled).
extern "C" void scn_DR_(OutputLayer_updateOutput)(void **m,
                                                  THTensor *input_features,
                                                  THTensor *output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.inputLayerRuleBook;
  auto nPlanes = input_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
  } else {
    // rules[0][2] = number of original input rows.
    THTensor_(resize2d)(output_features, rules[0][2], nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(output_features),
                                  THTensor_(data)(input_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], false);
  }
}
// Output layer backward pass: gathers d_output rows back onto the active
// sites (the mirror of updateOutput) via InputLayer_ForwardPass.
extern "C" void
scn_DR_(OutputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                     THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.inputLayerRuleBook;
  auto nPlanes = d_output_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
  } else {
    THTensor_(resize2d)(d_input_features, nRows, nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(d_output_features),
                                 THTensor_(data)(d_input_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], false);
  }
}
// Batch/length ("BL") input layer: like InputLayer_updateOutput, but the
// input is 3d (presumably batch x length x nPlanes -- size[2] is nPlanes).
// Metadata layout used here: rules[0][1] = maxActive, rules[0][4] = nRows.
extern "C" void scn_DR_(BLInputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THTensor *input_features, THTensor *output_features, long mode) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.blLayer(spatialSize, input_coords, mode);
  auto nPlanes = input_features->size[2];
  auto &rules = _m.blLayerRuleBook;
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    // Copy, then flatten the 3d input into the 2d active-site matrix.
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
    THTensor_(resize2d)(output_features, *_m.inputNActive, nPlanes);
  } else {
    THTensor_(resize2d)(output_features, *_m.inputNActive, nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(input_features),
                                 THTensor_(data)(output_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// BL input layer backward pass: scatter d_output back into the original 3d
// shape rules[0][2] x rules[0][3] x nPlanes recorded by updateOutput.
extern "C" void
scn_DR_(BLInputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                      THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = d_output_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    // Forward was a copy; restore the 3d input shape.
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
    THTensor_(resize3d)(d_input_features, rules[0][2], rules[0][3], nPlanes);
  } else {
    THTensor_(resize3d)(d_input_features, rules[0][2], rules[0][3], nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(d_input_features),
                                  THTensor_(data)(d_output_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], mode == 4);
  }
}
// BL output layer forward pass: maps active-site features back to the 3d
// shape rules[0][2] x rules[0][3] x nPlanes, reusing the BL rule book via
// InputLayer_BackwardPass (averaging disabled).
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(void **m,
                                                    THTensor *input_features,
                                                    THTensor *output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = input_features->size[1];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    THTensor_(resizeAs)(output_features, input_features);
    THTensor_(copy)(output_features, input_features);
    THTensor_(resize3d)(output_features, rules[0][2], rules[0][3], nPlanes);
  } else {
    THTensor_(resize3d)(output_features, rules[0][2], rules[0][3], nPlanes);
    THTensor_(zero)(output_features);
    InputLayer_BackwardPass<real>(THTensor_(data)(output_features),
                                  THTensor_(data)(input_features), nRows,
                                  maxActive, nPlanes, &rules[1][0], false);
  }
}
// BL output layer backward pass: gathers the 3d d_output (size[2] is
// nPlanes) back onto the 2d active-site matrix via InputLayer_ForwardPass.
extern "C" void
scn_DR_(BLOutputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                       THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = d_output_features->size[2];
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  if (mode == 0) {
    // Forward was a copy; flatten back to the 2d active-site matrix.
    THTensor_(resizeAs)(d_input_features, d_output_features);
    THTensor_(copy)(d_input_features, d_output_features);
    THTensor_(resize2d)(d_input_features, nRows, nPlanes);
  } else {
    THTensor_(resize2d)(d_input_features, nRows, nPlanes);
    THTensor_(zero)(d_input_features);
    InputLayer_ForwardPass<real>(THTensor_(data)(d_output_features),
                                 THTensor_(data)(d_input_features), nRows,
                                 maxActive, nPlanes, &rules[1][0], false);
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/LeakyReLU.cpp"
#else
// Elementwise leaky ReLU: out = in when in > 0, otherwise alpha * in.
// Operates in place when input_features == output_features.
extern "C" void scn_R_(LeakyReLU_updateOutput)(THTensor *input_features,
                                               THTensor *output_features,
                                               float alpha) {
  if (input_features != output_features)
    THTensor_(resizeAs)(output_features, input_features);
  auto src = THTensor_(data)(input_features);
  auto dst = THTensor_(data)(output_features);
  auto count = THTensor_(nElement)(input_features);
  for (uInt idx = 0; idx < count; ++idx) {
    auto v = src[idx];
    dst[idx] = (v > 0) ? v : v * alpha;
  }
}
// Leaky ReLU backward pass: d_in = d_out where the forward input was
// positive, alpha * d_out otherwise.
// Operates in place when d_input_features == d_output_features.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THTensor *input_features,
                                                  THTensor *d_input_features,
                                                  THTensor *d_output_features,
                                                  float alpha) {
  if (d_input_features != d_output_features)
    THTensor_(resizeAs)(d_input_features, d_output_features);
  auto fwdIn = THTensor_(data)(input_features);
  auto gradIn = THTensor_(data)(d_input_features);
  auto gradOut = THTensor_(data)(d_output_features);
  auto count = THTensor_(nElement)(d_input_features);
  for (uInt idx = 0; idx < count; ++idx)
    gradIn[idx] = (fwdIn[idx] > 0) ? gradOut[idx] : gradOut[idx] * alpha;
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/MaxPooling.cpp"
#else
#include "MaxPooling.h"
// Sparse max-pooling forward pass. The first nFeaturesToDrop feature planes
// are skipped (iF is advanced past them); stride[0] lets the kernels walk
// rows of possibly-wider matrices.
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  // One rule vector per pooling-filter offset; pairs index (in, out) rows.
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
                                 output_features->stride[0], &r[0], nHot);
  }
}
// Sparse max-pooling backward pass: routes each output gradient to the
// input row that won the forward max (the kernel compares iF/oF values).
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *output_features,
    THTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // NOTE(review): nActive is unused here; kept in case getNActive has
  // required side effects -- confirm.
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
                                  input_features->stride[0],
                                  output_features->stride[0], &r[0], nHot);
  }
}
// Max pooling with randomized strides: identical to MaxPooling_updateOutput
// except the rule book comes from getRandomizedStrideRuleBook.
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
                                 output_features->stride[0], &r[0], nHot);
  }
}
// Backward pass for randomized-stride max pooling: identical to
// MaxPooling_updateGradInput except for the rule-book source.
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *output_features,
    THTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // NOTE(review): nActive is unused here; kept in case getNActive has
  // required side effects -- confirm.
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
                                  input_features->stride[0],
                                  output_features->stride[0], &r[0], nHot);
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/NetworkInNetwork.cpp"
#else
// 1x1 "network in network" convolution: output = input * weight (+ bias).
// weight_ is input_nPlanes x output_nPlanes (row-major).
// Returns nActive * input_nPlanes * output_nPlanes (apparently a
// multiply-count for the caller's accounting -- confirm at call sites).
extern "C" double
scn_R_(NetworkInNetwork_updateOutput)(THTensor *input_features_,
                                      THTensor *output_features_,
                                      THTensor *weight_, THTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(output_features_, nActive, output_nPlanes);
  auto input_features = THTensor_(data)(input_features_);
  auto output_features = THTensor_(data)(output_features_);
  auto weight = THTensor_(data)(weight_);
  if (bias_ != nullptr) {
    // Set bias
    auto bias = THTensor_(data)(bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_features is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // input_features * weight + bias -> output_features (beta = 1 keeps bias)
    THBlas_(gemm)('n', 'n',
                  output_nPlanes, // r
                  nActive,        // l
                  input_nPlanes,  // m
                  1,              // alpha
                  weight, output_nPlanes, // r
                  input_features,
                  input_nPlanes, // m
                  1,             // beta
                  output_features, output_nPlanes // r
                  );
  } else {
    // No bias: beta = 0 overwrites the output.
    THTensor_(zero)(output_features_);
    THBlas_(gemm)('n', 'n',
                  output_nPlanes, // r
                  nActive,        // l
                  input_nPlanes,  // m
                  1,              // alpha
                  weight, output_nPlanes,        // r
                  input_features, input_nPlanes, // m
                  0,                             // beta
                  output_features, output_nPlanes // r
                  );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// Backward pass of the 1x1 convolution w.r.t. the input:
// d_input = d_output * T(weight).
extern "C" void
scn_R_(NetworkInNetwork_updateGradInput)(THTensor *d_input_features_,
                                         THTensor *d_output_features_,
                                         THTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(d_input_features_, nActive, input_nPlanes);
  THTensor_(zero)(d_input_features_);
  auto d_input_features = THTensor_(data)(d_input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto weight = THTensor_(data)(weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weight is r*m (row-major)
  // d_input_features is l*r (row-major)
  // d_output_features * T(weight) -> d_input_features
  THBlas_(gemm)('t', 'n',
                input_nPlanes,  // r
                nActive,        // l
                output_nPlanes, // m
                1,              // alpha
                weight, output_nPlanes,            // m
                d_output_features, output_nPlanes, // m
                0,                                 // beta
                d_input_features, input_nPlanes    // r
                );
}
// Accumulate parameter gradients for the 1x1 convolution:
// d_weight += T(input) * d_output, and, when d_bias_ is given,
// d_bias += column sums of d_output.
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THTensor *input_features_, THTensor *d_output_features_,
    THTensor *d_weight_, THTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THTensor_(data)(input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto d_weight = THTensor_(data)(d_weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // input_features is l*m (row-major), d_output_features is l*r (row-major)
  // T(input_features) * d_output_features -> d_weight (beta = 1 accumulates)
  THBlas_(gemm)('n', 't',
                output_nPlanes, // r
                input_nPlanes,  // l
                nActive,        // m
                1,              // alpha
                d_output_features, output_nPlanes, // r
                input_features, input_nPlanes,     // l
                1,                                 // beta
                d_weight, output_nPlanes           // r
                );
  if (d_bias_) {
    auto d_bias = THTensor_(data)(d_bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_NetworkInNetwork_H
#define CPU_NetworkInNetwork_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// buffer must have size >= output_nActive * filterVolume * input_nPlanes
template <typename T>
void NetworkInNetwork_ForwardPass(
T *input_features, uInt input_nPlanes, T *output_features,
uInt output_nPlanes, T *weight, T *bias, uInt output_nActive,
void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
if (bias != nullptr) {
// Set bias
for (uInt row = 0; row < output_nActive; row++)
for (uInt column = 0; column < output_nPlanes; column++)
output_features[row * output_nPlanes + column] = bias[column];
// Do GEMM (note: gemm assumes column-major matrices)
// buffer is l*m (row-major)
// weight is r*m (row-major)
// output_features is l*r (row-major)
// buffer * T(weights) + bias -> output_features
(*gemm)('n', 'n',
output_nPlanes, // r
output_nActive, // l
input_nPlanes * filterVolume, // m
1, // alpha
weight, output_nPlanes, // r
buffer,
input_nPlanes * filterVolume, // m
1, // beta
output_features, output_nPlanes // r
);
} else {
(*gemm)('n', 'n',
output_nPlanes, // r
output_nActive, // l
input_nPlanes * filterVolume, // m
1, // alpha
weight, output_nPlanes, // r
buffer, input_nPlanes * filterVolume, // m
0, // beta
output_features, output_nPlanes // r
);
}
}
// Backward pass w.r.t. the input: d_buffer = d_output * T(weight), then
// scatter-add d_buffer rows into d_input via `rules`.
// rules maps each (output row, filter offset) slot to an input row; uInt_MAX
// marks an inactive slot. d_buffer must hold
// output_nActive * filterVolume * input_nPlanes elements.
template <typename T>
void NetworkInNetwork_BackwardPass(
    T *d_input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *weight, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *d_buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  (*gemm)('t', 'n',
          input_nPlanes * filterVolume, // r
          output_nActive,               // l
          output_nPlanes,               // m
          1,                            // alpha
          weight, output_nPlanes,            // m
          d_output_features, output_nPlanes, // m
          0,                                 // beta
          d_buffer, input_nPlanes * filterVolume // r
          );
  // Use rules and d_buffer to accumulate gradient information into d_input
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    auto r = rules[row];
    if (r != uInt_MAX) // 2^32-1
      for (uInt i = 0; i < input_nPlanes; i++)
        d_input_features[r * input_nPlanes + i] +=
            d_buffer[row * input_nPlanes + i];
  }
}
// Accumulate d_weight (and optionally d_bias). First gathers the active
// input rows into `buffer` according to `rules` (zero-filling inactive
// slots), then d_weight += T(buffer) * d_output.
// buffer must hold output_nActive * filterVolume * input_nPlanes elements.
template <typename T>
void NetworkInNetwork_GradWeights(
    T *input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *d_weight, T *d_bias, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // d_weight
  // Use input_features and rules to fill buffer
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX) { // 2^32-1 marks an inactive slot
      std::memset(buffer + row * input_nPlanes, 0, sizeof(T) * input_nPlanes);
    } else {
      std::memcpy(buffer + row * input_nPlanes,
                  input_features + rules[row] * input_nPlanes,
                  sizeof(T) * input_nPlanes);
    }
  }
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight (beta = 1 accumulates)
  (*gemm)('n', 't',
          output_nPlanes,               // r
          input_nPlanes * filterVolume, // l
          output_nActive,               // m
          1,                            // alpha
          d_output_features, output_nPlanes,     // r
          buffer, input_nPlanes * filterVolume,  // l
          1,                                     // beta
          d_weight, output_nPlanes               // r
          );
  if (d_bias) // d_bias accumulates the column sums of d_output
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/SparseToDense.cpp"
#else
#include "SparseToDense.h"
// Convert sparse active-site features to a dense tensor of shape
// (batchSize, nPlanes, spatialSize...). Locations with no active site stay
// zero. The nDimension == 2 check skips empty inputs -- presumably an
// uninitialized feature tensor is not 2d; confirm.
extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *output_features, long nPlanes) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    long sz[Dimension + 2];
    sz[0] = _m.grids.begin()->second.size(); // batch size
    sz[1] = nPlanes;
    std::memcpy(sz + 2, THLongTensor_data(inputSize), sizeof(long) * Dimension);
    THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
    THTensor_(zero)(output_features);
  }
  if (input_features->nDimension == 2) {
    auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
    uInt _nPlanes = input_features->size[1];
    auto iF = THTensor_(data)(input_features);
    auto oF = THTensor_(data)(output_features);
    long spatialVolume = THLongTensor_prodall(inputSize);
    // One rule vector per batch sample; advance oF one dense sample at a time.
    for (auto &r : _rules) {
      uInt nHot = r.size() / 2;
      SparseToDense_ForwardPass<real>(iF, oF, _nPlanes, spatialVolume, &r[0],
                                      nHot);
      oF += _nPlanes * spatialVolume;
    }
  }
}
// Backward pass of SparseToDense: gather gradients from the dense tensor
// back onto the sparse active-site rows.
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  if (input_features->nDimension == 2) {
    auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = THLongTensor_prodall(inputSize);
    uInt _nPlanes = d_input_features->size[1];
    auto diF = THTensor_(data)(d_input_features);
    auto doF = THTensor_(data)(d_output_features);
    // One rule vector per batch sample; advance doF one dense sample at a time.
    for (auto &r : _rules) {
      uInt nHot = r.size() / 2;
      SparseToDense_BackwardPass<real>(diF, doF, _nPlanes, spatialVolume, &r[0],
                                       nHot);
      doF += _nPlanes * spatialVolume;
    }
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// X-macro driver: instantiate the "generic" source file named by
// TH_GENERIC_FILE_ once per spatial Dimension 1..10.
// THGenerateFloatTypes.h #includes TH_GENERIC_FILE for float and double and
// #undefs TH_GENERIC_FILE afterwards, so TH_GENERIC_FILE must be re-#defined
// before every include.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimFloatTypes.h"
#endif
// NOTE(review): this first #define is redundant (repeated immediately below
// before the first include); harmless, as the redefinition is identical.
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Compile the current TH_GENERIC_FILE twice: once with real = float and once
// with real = double (accreal is double in both cases). TH_GENERIC_FILE is
// #undef'd at the end, so callers must re-#define it before each include.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h"
#endif
#define real float
#define accreal double
#define Real Float
#define TH_REAL_IS_FLOAT
// #line resets reported file/line to the generic file for diagnostics.
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_FLOAT
#define real double
#define accreal double
#define Real Double
#define TH_REAL_IS_DOUBLE
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_DOUBLE
#undef TH_GENERIC_FILE
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/UnPooling.cpp"
#else
#include "UnPooling.h"
// Sparse unpooling forward pass. Note the swapped (outputSize, inputSize)
// argument order to getRuleBook: unpooling apparently reuses the pooling
// rule book in the reverse direction -- confirm against getRuleBook.
// The first nFeaturesToDrop input planes are skipped.
extern "C" void scn_DR_(UnPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    UnPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->size[1],
                                output_features->size[1], &r[0], nHot,
                                _rules.size());
  }
}
// Sparse unpooling backward pass: accumulate d_output back onto the input
// rows using the same (reversed) rule book as the forward pass.
extern "C" void scn_DR_(UnPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *d_output_features,
    long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  // NOTE(review): nActive is unused here; kept in case getNActive has
  // required side effects -- confirm.
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto diF = THTensor_(data)(d_input_features) + nFeaturesToDrop;
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    UnPooling_BackwardPass<real>(diF, doF, nPlanes, input_features->size[1],
                                 d_output_features->size[1], &r[0], nHot,
                                 _rules.size());
  }
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/ActivePooling.cu"
#else
#include "ActivePooling.h"
// GPU active pooling: pool every active site of each sample into a single
// feature row (sum, or mean when `average`). Rules live on the host; a
// 2^22-int device tensor is used as a staging buffer, and rule rows are
// copied host->device in batches of up to 32768 samples (each sample uses
// maxActive + 1 ints of the buffer).
extern "C" void scn_DR_(ActivePooling_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resize2d)(state, output_features, batchSize, nPlanes);
  THCTensor_(zero)(state, output_features);
  auto rulesBuffer = THCITensor_(new)(state);
  if (THCITensor_(nElement)(state, rulesBuffer) < 1 << 22)
    THCITensor_(resize1d)(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_ForwardPass<real>(iF, oF + o * nPlanes, batchSize_, maxActive,
                                    nPlanes, rb, average);
  }
  THCITensor_(free)(state, rulesBuffer);
}
// GPU active pooling backward pass: distribute each sample's pooled
// gradient row back over its active sites, using the same host->device
// rule staging scheme as updateOutput.
extern "C" void scn_DR_(ActivePooling_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto rulesBuffer = THCITensor_(new)(state);
  if (THCITensor_(nElement)(state, rulesBuffer) < 1 << 22)
    THCITensor_(resize1d)(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_BackwardPass<real>(diF, doF + o * nPlanes, batchSize_,
                                     maxActive, nPlanes, rb, average);
  }
  THCITensor_(free)(state, rulesBuffer);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/AffineReluTrivialConvolution.cu"
#else
#include "AffineReluTrivialConvolution.h"
#include <algorithm>
extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
THCTensor *input_features, THCTensor *output_features,
THCTensor *affineWeight, THCTensor *affineBias, THCTensor *convWeight) {
THCTensor_(resize2d)(state, output_features, input_features->size[0],
convWeight->size[1]);
dAffineReluTrivialConvolution_forward<real>(
THCTensor_(data)(state, input_features),
THCTensor_(data)(state, output_features),
THCTensor_(data)(state, affineWeight),
THCTensor_(data)(state, affineBias), THCTensor_(data)(state, convWeight),
convWeight->size[0], input_features->stride[0], convWeight->size[1],
output_features->size[1], input_features->size[0]);
}
extern "C" void scn_R_(AffineReluTrivialConvolution_backward)(
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *affineWeight,
THCTensor *d_affineWeight, THCTensor *affineBias, THCTensor *d_affineBias,
THCTensor *convWeight, THCTensor *d_convWeight, bool additiveGrad) {
THCTensor_(resizeAs)(state, d_input_features, input_features);
dAffineReluTrivialConvolution_backward_dW<real>(
THCTensor_(data)(state, input_features),
THCTensor_(data)(state, d_input_features),
THCTensor_(data)(state, d_output_features),
THCTensor_(data)(state, affineWeight),
THCTensor_(data)(state, d_affineWeight),
THCTensor_(data)(state, affineBias),
THCTensor_(data)(state, d_affineBias),
THCTensor_(data)(state, convWeight),
THCTensor_(data)(state, d_convWeight), convWeight->size[0],
input_features->stride[0], convWeight->size[1],
d_output_features->stride[0], input_features->size[0], additiveGrad);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/AveragePooling.cu"
#else
#include "AveragePooling.h"
#include "RuleBookIterator.h"
extern "C" void scn_DR_(AveragePooling_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *output_features, long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive,
input_features->size[1] - nFeaturesToDrop);
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
auto oF = THCTensor_(data)(state, output_features);
RULEBOOKITERATOR(AveragePooling_ForwardPass<real>(
THCState_getCurrentStream(state), iF, oF, nPlanes,
input_features->size[1], output_features->size[1], rbB,
nHotB, _rules.size());
, )
}
extern "C" void scn_DR_(AveragePooling_updateGradInput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
THLongTensor *poolStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
long nFeaturesToDrop) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
auto _rules =
_m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
auto doF = THCTensor_(data)(state, d_output_features);
RULEBOOKITERATOR(AveragePooling_BackwardPass<real>(
THCState_getCurrentStream(state), diF, doF, nPlanes,
input_features->size[1], d_output_features->size[1], rbB,
nHotB, _rules.size());
, )
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchNormalization.cu"
#else
#include "BatchNormalization.h"
// Forward-pass dispatch helper: expands to `if (nPlanes % N == 0) { ... }` so
// a chain of BN_F_MACRO(16) else BN_F_MACRO(12) ... picks the largest vector
// width N (template parameter) that divides nPlanes. Expects nPlanes,
// input_stride, output_stride and nActive in the enclosing scope.
#define BN_F_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_ForwardPass<real, N, 64>(                               \
        THCTensor_(data)(state, input_features),                               \
        THCTensor_(data)(state, output_features), nPlanes, input_stride,       \
        output_stride, nActive, THCTensor_(data)(state, saveMean),             \
        THCTensor_(data)(state, saveInvStd),                                   \
        THCTensor_(data)(state, runningMean),                                  \
        THCTensor_(data)(state, runningVar),                                   \
        weight ? THCTensor_(data)(state, weight) : 0,                          \
        bias ? THCTensor_(data)(state, bias) : 0, eps, momentum, train,        \
        leakiness);                                                            \
  }
extern "C" void scn_R_(BatchNormalization_updateOutput)(
THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
real leakiness) {
THCTensor_(resizeAs)(state, output_features, input_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
THCTensor *input_features, THCTensor *output_features, THCTensor *saveMean,
THCTensor *saveInvStd, THCTensor *runningMean, THCTensor *runningVar,
THCTensor *weight, THCTensor *bias, real eps, real momentum, bool train,
real leakiness) {
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
#undef BN_F_MACRO
// Backward-pass analogue of BN_F_MACRO: dispatches
// BatchNormalization_BackwardPass on the largest vector width N dividing
// nPlanes. d_weight/d_bias may be null (0 is passed to the kernel).
#define BN_B_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_BackwardPass<real, N, 64>(                              \
        THCTensor_(data)(state, input_features),                               \
        THCTensor_(data)(state, d_input_features),                             \
        THCTensor_(data)(state, output_features),                              \
        THCTensor_(data)(state, d_output_features), nPlanes, input_stride,     \
        output_stride, nActive, THCTensor_(data)(state, saveMean),             \
        THCTensor_(data)(state, saveInvStd),                                   \
        THCTensor_(data)(state, runningMean),                                  \
        THCTensor_(data)(state, runningVar),                                   \
        weight ? THCTensor_(data)(state, weight) : 0,                          \
        bias ? THCTensor_(data)(state, bias) : 0,                              \
        d_weight ? THCTensor_(data)(state, d_weight) : 0,                      \
        d_bias ? THCTensor_(data)(state, d_bias) : 0, leakiness);              \
  }
extern "C" void scn_R_(BatchNormalization_backward)(
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *output_features, THCTensor *d_output_features,
THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
THCTensor *runningVar, THCTensor *weight, THCTensor *bias,
THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
if (input_features->nDimension == 2) {
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
auto input_stride = input_features->stride[0];
auto output_stride = output_features->stride[0];
BN_B_MACRO(16)
else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Convolution.cu"
#else
#include "Convolution.h"
#include "RuleBookIterator.h"
#include <algorithm>
#include <cstring>
extern "C" double scn_DR_(Convolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *output_features, THCTensor *weight, THCTensor *bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(Convolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
THCTensor *bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
extern "C" double scn_DR_(FullConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(FullConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **mIn, void **mOut,
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
auto _rules = _mIn.getFullConvolutionRuleBook(
inputSize, outputSize, filterSize, filterStride, _mOut);
uInt nActive = _mOut.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
extern "C" double scn_DR_(RandomizedStrideConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride,
void **m, THCTensor *input_features, THCTensor *output_features,
THCTensor *weight, THCTensor *bias, long filterVolume,
THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
double flops = 0;
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim, 0,
THCState_getCurrentStream(state)>>>(
oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
}
return flops;
}
extern "C" void scn_DR_(RandomizedStrideConvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride,
void **m, THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
if (nActive) {
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Deconvolution.cu"
#else
#include "Convolution.h"
#include "Deconvolution.h"
#include <algorithm>
extern "C" double scn_DR_(Deconvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *output_features, THCTensor *weight, THCTensor *bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias)
THCTensor_(zero)(state, output_features);
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto ip = input_features->size[1];
auto op = output_features->size[1];
auto w = THCTensor_(data)(state, weight);
double flops = 0;
if (bias) {
auto b = THCTensor_(data)(state, bias);
for (uInt i = 0; i < op; i += 32) {
uInt blockDim = min(32L, op - i);
uInt gridDim = min(4096, nActive);
Convolution_fp_bias
<< <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
(oF + i, b + i, op, op, nActive);
}
}
uInt c = ip * op;
RULEBOOKITERATOR(
dDeconvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; flops += nHotB * c;)
return flops;
}
extern "C" void scn_DR_(Deconvolution_backward)(
THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
THLongTensor *filterStride, void **m, THCTensor *input_features,
THCTensor *d_input_features, THCTensor *d_output_features,
THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
long filterVolume) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules =
_m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
uInt nActive = _m.getNActive(outputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features);
auto iF = THCTensor_(data)(state, input_features);
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto ip = input_features->size[1];
auto op = d_output_features->size[1];
auto w = THCTensor_(data)(state, weight);
auto dw = THCTensor_(data)(state, d_weight);
uInt c = ip * op;
RULEBOOKITERATOR(dDeconvolution_backward_dW2<real>(
iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
THCState_getCurrentStream(state));
, w += c; dw += c;)
if (d_bias) {
auto db = THCTensor_(data)(state, d_bias);
Convolution_bp_bias(doF, db, op, op, nActive,
THCState_getCurrentStream(state));
}
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/IOLayers.cu"
#else
#include "IOLayers.h"
extern "C" void scn_DR_(InputLayer_updateOutput)(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THCTensor *input_features, THCTensor *output_features, long batchSize,
long mode) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.inputLayer(spatialSize, input_coords, batchSize, mode);
uInt nPlanes = input_features->size[1];
auto &rules = _m.inputLayerRuleBook;
uInt maxActive = rules[0][1];
uInt nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
} else {
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(InputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
auto mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
} else {
THCTensor_(resize2d)(state, d_input_features, rules[0][2], nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(OutputLayer_updateOutput)(void **m,
THCTensor *input_features,
THCTensor *output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = input_features->size[1];
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
} else {
THCTensor_(resize2d)(state, output_features, rules[0][2], nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(OutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.inputLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
} else {
THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(BLInputLayer_updateOutput)(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THCTensor *input_features, THCTensor *output_features, long mode) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
_m.blLayer(spatialSize, input_coords, mode);
uInt nPlanes = input_features->size[2];
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
THCTensor_(zero)(state, output_features);
auto &rules = _m.blLayerRuleBook;
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
} else {
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void
scn_DR_(BLInputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
THCTensor *d_output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.blLayerRuleBook;
uInt nPlanes = d_output_features->size[1];
uInt mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode == 0) {
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
THCTensor_(copy)(state, d_input_features, d_output_features);
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
nPlanes);
} else {
THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
nPlanes);
THCTensor_(zero)(state, d_input_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto diF = THCTensor_(data)(state, d_input_features);
auto doF = THCTensor_(data)(state, d_output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
THCITensor_(free)(state, rulesBuffer);
}
}
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(void **m,
THCTensor *input_features,
THCTensor *output_features) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto &rules = _m.blLayerRuleBook;
uInt nPlanes = input_features->size[1];
auto mode = rules[0][0];
uInt maxActive = rules[0][1];
uInt nRows = rules[0][4];
if (mode == 0) {
THCTensor_(resizeAs)(state, output_features, input_features);
THCTensor_(copy)(state, output_features, input_features);
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
nPlanes);
} else {
THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
nPlanes);
THCTensor_(zero)(state, output_features);
auto rulesBuffer = THCITensor_(new)(state);
THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
auto iF = THCTensor_(data)(state, input_features);
auto oF = THCTensor_(data)(state, output_features);
auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
THCState_getCurrentStream(state)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
THCITensor_(free)(state, rulesBuffer);
}
}
// Backward pass of the "BL" output layer: gathers gradients from the dense
// (b x l x nPlanes) d_output_features back into the flat (nRows x nPlanes)
// sparse gradient matrix, mirroring updateOutput above.
extern "C" void
scn_DR_(BLOutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
                                       THCTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  uInt nPlanes = d_output_features->size[2];
  uInt mode = rules[0][0];
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][4];
  if (mode == 0) {
    // Mode 0: dense layout — copy gradients and flatten back to 2-D.
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
    THCTensor_(copy)(state, d_input_features, d_output_features);
    THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
  } else {
    THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
    THCTensor_(zero)(state, d_input_features);
    // Device-side staging buffer for the rules.
    // NOTE(review): as in updateOutput, sizeof(uInt) * size is passed as an
    // ELEMENT count to resize1d — likely over-allocation; verify.
    auto rulesBuffer = THCITensor_(new)(state);
    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto diF = THCTensor_(data)(state, d_input_features);
    auto doF = THCTensor_(data)(state, d_output_features);
    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    // Backward of the output layer = forward (gather) of the input layer.
    InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
    THCITensor_(free)(state, rulesBuffer);
  }
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/LeakyReLU.cu"
#else
#include "LeakyReLU.h"
// Elementwise LeakyReLU forward pass on the GPU. Works in place when
// input_features == output_features; otherwise output is resized to match.
extern "C" void scn_R_(LeakyReLU_updateOutput)(THCTensor *input_features,
                                               THCTensor *output_features,
                                               float alpha) {
  // Only allocate separate storage in the out-of-place case.
  if (output_features != input_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto nElements = THCTensor_(nElement)(state, input_features);
  auto inPtr = THCTensor_(data)(state, input_features);
  auto outPtr = THCTensor_(data)(state, output_features);
  LeakyReLU_fp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      inPtr, outPtr, nElements, alpha);
}
// Elementwise LeakyReLU backward pass: computes d_input from d_output using
// the saved forward input to pick between slope 1 and slope alpha. Works in
// place when d_input_features == d_output_features.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THCTensor *input_features,
                                                  THCTensor *d_input_features,
                                                  THCTensor *d_output_features,
                                                  float alpha) {
  // Only allocate separate storage in the out-of-place case.
  if (d_output_features != d_input_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nElements = THCTensor_(nElement)(state, d_input_features);
  auto inPtr = THCTensor_(data)(state, input_features);
  auto dInPtr = THCTensor_(data)(state, d_input_features);
  auto dOutPtr = THCTensor_(data)(state, d_output_features);
  LeakyReLU_bp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      inPtr, dInPtr, dOutPtr, nElements, alpha);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/MaxPooling.cu"
#else
#include "MaxPooling.h"
#include "RuleBookIterator.h"
// Sparse max-pooling forward pass. Builds (or fetches) the pooling rule book
// from the metadata, sizes the output to (nActive x nPlanes), and runs the
// CUDA forward kernel chunk-by-chunk via RULEBOOKITERATOR.
// The first nFeaturesToDrop feature planes of each input row are skipped
// (note the pointer offset on iF; row stride stays input_features->size[1]).
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  // Zero-fill so output sites with no pooling rule read as zero.
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  // rbB / nHotB are supplied by the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
                                   nPlanes, input_features->size[1],
                                   output_features->size[1], rbB, nHotB);
      , )
}
// Sparse max-pooling backward pass. Routes each output gradient back to the
// input element that won the max (the kernel compares input against the saved
// forward output to find it). d_input is zeroed first so non-winning inputs
// and dropped planes get zero gradient.
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  // getNActive may update cached metadata; result itself is unused here.
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  // rbB / nHotB are supplied by the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), iF, diF,
                                    oF, doF, nPlanes, input_features->size[1],
                                    d_output_features->size[1], rbB, nHotB);
      , )
}
// Forward pass of sparse max pooling with randomized strides. Identical to
// MaxPooling_updateOutput except the rule book comes from
// getRandomizedStrideRuleBook. The first nFeaturesToDrop feature planes of
// each input row are skipped via the pointer offset; the row stride remains
// input_features->size[1].
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                               poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  // One output row per active output site; zero-fill so sites without a
  // pooling rule stay defined.
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  THCTensor_(zero)(state, output_features);
  auto inPtr = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto outPtr = THCTensor_(data)(state, output_features);
  // rbB / nHotB come from the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), inPtr,
                                   outPtr, nPlanes, input_features->size[1],
                                   output_features->size[1], rbB, nHotB);
      , )
}
// Backward pass of sparse max pooling with randomized strides. Identical to
// MaxPooling_updateGradInput except the rule book comes from
// getRandomizedStrideRuleBook.
extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                               poolStride, true);
  // getNActive may update cached metadata; its value is unused below.
  uInt nActive = _m.getNActive(outputSize);
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  // Input gradient matches the input's full shape; zero it so inputs that did
  // not win any max (and dropped planes) get zero gradient.
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto inPtr = THCTensor_(data)(state, input_features);
  auto outPtr = THCTensor_(data)(state, output_features);
  auto dInPtr = THCTensor_(data)(state, d_input_features);
  auto dOutPtr = THCTensor_(data)(state, d_output_features);
  // rbB / nHotB come from the RULEBOOKITERATOR macro expansion.
  RULEBOOKITERATOR(
      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), inPtr,
                                    dInPtr, outPtr, dOutPtr, nPlanes,
                                    input_features->size[1],
                                    d_output_features->size[1], rbB, nHotB);
      , )
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/NetworkInNetwork.cu"
#else
#include "Convolution.h"
#include <algorithm>
// 1x1 ("network in network") convolution forward pass:
//   output = input * weight (+ bias, broadcast over rows).
// input_features_: (nActive x input_nPlanes) row-major,
// weight_: (input_nPlanes x output_nPlanes) row-major,
// bias_: (output_nPlanes) or nullptr.
// Returns the multiply count nActive * input_nPlanes * output_nPlanes for the
// caller's FLOP accounting.
extern "C" double
scn_R_(NetworkInNetwork_updateOutput)(THCTensor *input_features_,
                                      THCTensor *output_features_,
                                      THCTensor *weight_, THCTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, output_features_, nActive, output_nPlanes);
  auto input_features = THCTensor_(data)(state, input_features_);
  auto output_features = THCTensor_(data)(state, output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  if (bias_ != nullptr) {
    // First write the bias into every output row, 32 planes per launch, then
    // accumulate the GEMM on top of it (beta = 1).
    auto bias = THCTensor_(data)(state, bias_);
    for (uInt i = 0; i < output_nPlanes; i += 32) {
      uInt blockDim = min(32L, output_nPlanes - i);
      uInt gridDim = min(4096L, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          output_features + i, bias + i, output_nPlanes, output_nPlanes,
          nActive);
    }
    // Do GEMM (note: gemm assumes column-major matrices)
    // input_features is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // input_features * weight + bias -> output_features
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes, // m
                1,             // beta
                output_features,
                output_nPlanes // r
                );
  } else {
    // No bias: with beta = 0 the GEMM overwrites every element of
    // output_features (BLAS does not read C when beta == 0, even if k == 0),
    // so the previous explicit zero-fill pass was redundant and is omitted.
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight,
                output_nPlanes, // r
                input_features,
                input_nPlanes, // m
                0,             // beta
                output_features,
                output_nPlanes // r
                );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// 1x1 convolution backward pass w.r.t. the input:
//   d_input = d_output * T(weight).
// d_output_features_: (nActive x output_nPlanes) row-major,
// weight_: (input_nPlanes x output_nPlanes) row-major.
extern "C" void
scn_R_(NetworkInNetwork_updateGradInput)(THCTensor *d_input_features_,
                                         THCTensor *d_output_features_,
                                         THCTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, d_input_features_, nActive, input_nPlanes);
  THCTensor_(zero)(state, d_input_features_);
  auto dInPtr = THCTensor_(data)(state, d_input_features_);
  auto dOutPtr = THCTensor_(data)(state, d_output_features_);
  auto wPtr = THCTensor_(data)(state, weight_);
  // GEMM assumes column-major storage, so the row-major operands are handed
  // over transposed; the 't' on the weight realizes the T(weight) product.
  //   d_output_features : l*m (row-major)
  //   weight            : r*m (row-major, i.e. input_nPlanes x output_nPlanes)
  //   d_input_features  : l*r (row-major)
  // d_output_features * T(weight) -> d_input_features
  THBLAS_GEMM(state, 't', 'n',
              input_nPlanes,  // r
              nActive,        // l
              output_nPlanes, // m
              1,              // alpha
              wPtr, output_nPlanes,    // m
              dOutPtr, output_nPlanes, // m
              0,                       // beta
              dInPtr, input_nPlanes    // r
              );
}
// 1x1 convolution parameter-gradient accumulation:
//   d_weight += T(input) * d_output, and (optionally) d_bias += column sums of
//   d_output. Both accumulate (beta = 1 / bias kernel adds in place).
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THCTensor *input_features_, THCTensor *d_output_features_,
    THCTensor *d_weight_, THCTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto inPtr = THCTensor_(data)(state, input_features_);
  auto dOutPtr = THCTensor_(data)(state, d_output_features_);
  auto dWeightPtr = THCTensor_(data)(state, d_weight_);
  // GEMM assumes column-major storage; the 't' on the input realizes the
  // T(input) factor.
  //   input_features    : m*l (row-major)
  //   d_output_features : m*r (row-major)
  //   d_weight          : l*r (row-major)
  // T(input_features) * d_output_features -> d_weight (accumulated, beta = 1)
  THBLAS_GEMM(state, 'n', 't',
              output_nPlanes, // r
              input_nPlanes,  // l
              nActive,        // m
              1,              // alpha
              dOutPtr, output_nPlanes, // r
              inPtr, input_nPlanes,    // l
              1,                       // beta
              dWeightPtr, output_nPlanes // r
              );
  if (d_bias_) {
    // Reduce d_output over all active rows into the bias gradient.
    auto dBiasPtr = THCTensor_(data)(state, d_bias_);
    Convolution_bp_bias(dOutPtr, dBiasPtr, output_nPlanes, output_nPlanes,
                        nActive, THCState_getCurrentStream(state));
  }
}
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment