OpenDAS / SparseConvNet
Commit f9552033 authored Jul 16, 2017 by Benjamin Thomas Graham

initial commit
Changes: 168
Showing 20 changed files with 2602 additions and 0 deletions (+2602 -0)
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.cpp                     +61   -0
PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.h                       +36   -0
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.cpp               +124  -0
PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.h                 +128  -0
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.cpp                  +64   -0
PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.h                    +36   -0
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateDimFloatTypes.h          +63   -0
PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateFloatTypes.h             +37   -0
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.cu                   +69   -0
PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.h                    +57   -0
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.cu    +51   -0
PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h     +532  -0
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.cu                  +58   -0
PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.h                   +76   -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.cu              +92   -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.h               +213  -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu  +75   -0
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h   +53   -0
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu                     +153  -0
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h                      +624  -0

PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.cpp  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/MaxPooling.cpp"
#else
#include "MaxPooling.h"

extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *output_features, long nFeaturesToDrop, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resize2d)(output_features, nActive,
                      input_features->size[1] - nFeaturesToDrop);
  THTensor_(zero)(output_features);
  auto iF = THTensor_(data)(input_features) + nFeaturesToDrop;
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_ForwardPass<real>(iF, oF, nPlanes, input_features->stride[0],
                                 output_features->stride[0], &r[0], nHot);
  }
}

extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *output_features,
    THTensor *d_output_features, long nFeaturesToDrop, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    MaxPooling_BackwardPass<real>(iF, diF, oF, doF, nPlanes,
                                  input_features->stride[0],
                                  output_features->stride[0], &r[0], nHot);
  }
}
#endif

PyTorch/sparseconvnet/SCN/generic/CPU/MaxPooling.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
#include "../SparseConvNet.h"

template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features,
                            uInt nPlanes, uInt input_stride,
                            uInt output_stride, uInt *rules, uInt nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    uInt i = rules[2 * outSite] * input_stride;
    uInt o = rules[2 * outSite + 1] * output_stride;
    for (uInt plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] < input_features[i + plane])
        output_features[o + plane] = input_features[i + plane];
  }
}

template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                             T *output_features, T *d_output_features,
                             uInt nPlanes, uInt input_stride,
                             uInt output_stride, uInt *rules, uInt nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    uInt i = rules[2 * outSite] * input_stride;
    uInt o = rules[2 * outSite + 1] * output_stride;
    for (uInt plane = 0; plane < nPlanes; plane++)
      if (output_features[o + plane] == input_features[i + plane])
        d_input_features[i + plane] += d_output_features[o + plane];
  }
}
#endif /* CPU_MAXPOOLING_H */
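
The rulebook contract driving these loops is easiest to see in isolation: each rule list is a flat array of (inputSite, outputSite) index pairs, hence nHot = r.size() / 2 and the rules[2*k], rules[2*k+1] indexing. A minimal standalone sketch (toy sizes and values are assumptions, not part of the commit; like the library, it relies on the output being zero-initialized first):

#include <cstdio>

typedef unsigned int uInt;

// Hypothetical stand-in for MaxPooling_ForwardPass above, same contract.
template <typename T>
void maxpool_fp(T *in, T *out, uInt nPlanes, uInt inStride, uInt outStride,
                uInt *rules, uInt nHot) {
  for (uInt s = 0; s < nHot; s++) {
    uInt i = rules[2 * s] * inStride, o = rules[2 * s + 1] * outStride;
    for (uInt p = 0; p < nPlanes; p++)
      if (out[o + p] < in[i + p]) out[o + p] = in[i + p];
  }
}

int main() {
  const uInt nPlanes = 2;
  float in[3 * nPlanes] = {1, 5, 4, 2, 3, 9}; // 3 active input sites
  float out[1 * nPlanes] = {0, 0};            // 1 output site, zeroed first
  uInt rules[] = {0, 0, 1, 0, 2, 0};          // all three pool into site 0
  maxpool_fp(in, out, nPlanes, nPlanes, nPlanes, rules, 3);
  printf("%g %g\n", out[0], out[1]); // prints: 4 9 (per-plane maxima)
  return 0;
}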

PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.cpp  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/CPU/NetworkInNetwork.cpp"
#else

extern "C" double scn_R_(NetworkInNetwork_updateOutput)(
    THTensor *input_features_, THTensor *output_features_, THTensor *weight_,
    THTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(output_features_, nActive, output_nPlanes);
  auto input_features = THTensor_(data)(input_features_);
  auto output_features = THTensor_(data)(output_features_);
  auto weight = THTensor_(data)(weight_);
  if (bias_ != nullptr) {
    // Set bias
    auto bias = THTensor_(data)(bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is r*m (row-major)
    // output_features is l*r (row-major)
    // buffer * T(weights) + bias -> output_features
    THBlas_(gemm)('n', 'n',
                  output_nPlanes,                  // r
                  nActive,                         // l
                  input_nPlanes,                   // m
                  1,                               // alpha
                  weight, output_nPlanes,          // r
                  input_features, input_nPlanes,   // m
                  1,                               // beta
                  output_features, output_nPlanes  // r
                  );
  } else {
    THTensor_(zero)(output_features_);
    THBlas_(gemm)('n', 'n',
                  output_nPlanes,                  // r
                  nActive,                         // l
                  input_nPlanes,                   // m
                  1,                               // alpha
                  weight, output_nPlanes,          // r
                  input_features, input_nPlanes,   // m
                  0,                               // beta
                  output_features, output_nPlanes  // r
                  );
  }
  return nActive * input_nPlanes * output_nPlanes;
}

extern "C" void scn_R_(NetworkInNetwork_updateGradInput)(
    THTensor *d_input_features_, THTensor *d_output_features_,
    THTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THTensor_(resize2d)(d_input_features_, nActive, input_nPlanes);
  THTensor_(zero)(d_input_features_);
  auto d_input_features = THTensor_(data)(d_input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto weight = THTensor_(data)(weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  THBlas_(gemm)('t', 'n',
                input_nPlanes,                      // r
                nActive,                            // l
                output_nPlanes,                     // m
                1,                                  // alpha
                weight, output_nPlanes,             // m
                d_output_features, output_nPlanes,  // m
                0,                                  // beta
                d_input_features, input_nPlanes     // r
                );
}

extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THTensor *input_features_, THTensor *d_output_features_,
    THTensor *d_weight_, THTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THTensor_(data)(input_features_);
  auto d_output_features = THTensor_(data)(d_output_features_);
  auto d_weight = THTensor_(data)(d_weight_);
  auto d_bias = d_bias_ and THTensor_(data)(d_bias_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight
  THBlas_(gemm)('n', 't',
                output_nPlanes,                     // r
                input_nPlanes,                      // l
                nActive,                            // m
                1,                                  // alpha
                d_output_features, output_nPlanes,  // r
                input_features, input_nPlanes,      // l
                1,                                  // beta
                d_weight, output_nPlanes            // r
                );
  if (d_bias_) {
    auto d_bias = THTensor_(data)(d_bias_);
    for (uInt row = 0; row < nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
  }
}
#endif
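
The 'n','n' GEMM calls above work because a row-major l x m array is bitwise identical to a column-major m x l array: asking a column-major GEMM for the r x l product weight^T (r x m) times input^T (m x l) leaves exactly the row-major l x r product input * weight in memory. A sketch verifying this with a hand-rolled column-major GEMM (gemm_nn is a hypothetical stand-in for THBlas_(gemm); toy sizes assumed):

#include <cstdio>

// Column-major C(m x n) = alpha * A(m x k) * B(k x n) + beta * C, 'n','n' only.
void gemm_nn(long m, long n, long k, float alpha, const float *A, long lda,
             const float *B, long ldb, float beta, float *C, long ldc) {
  for (long j = 0; j < n; j++)
    for (long i = 0; i < m; i++) {
      float acc = 0;
      for (long p = 0; p < k; p++) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
}

int main() {
  // input: 2 active sites x 3 planes (row-major); weight: 3 x 2 (row-major)
  float in[2 * 3] = {1, 2, 3, 4, 5, 6};
  float w[3 * 2] = {1, 0, 0, 1, 1, 1};
  float out[2 * 2] = {0, 0, 0, 0};
  // Same argument order as the THBlas_(gemm) call in updateOutput:
  gemm_nn(/*r=*/2, /*l=*/2, /*m=*/3, 1, w, 2, in, 3, 0, out, 2);
  // Row-major out = in * w: rows (1+3, 2+3) and (4+6, 5+6)
  printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]); // 4 5 / 10 11
  return 0;
}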

PyTorch/sparseconvnet/SCN/generic/CPU/NetworkInNetwork.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_NetworkInNetwork_H
#define CPU_NetworkInNetwork_H
#include "../SparseConvNet.h"
#include "Convolution.h"

// buffer must have size >= output_nActive * filterVolume * input_nPlanes
// (buffer holds the pre-gathered input rows, one per (output site, filter
// offset) pair, as filled by the gather loop in NetworkInNetwork_GradWeights)
template <typename T>
void NetworkInNetwork_ForwardPass(
    T *buffer, uInt input_nPlanes, T *output_features, uInt output_nPlanes,
    T *weight, T *bias, uInt filterVolume, uInt output_nActive,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  if (bias != nullptr) {
    // Set bias
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt column = 0; column < output_nPlanes; column++)
        output_features[row * output_nPlanes + column] = bias[column];
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is r*m (row-major)
    // output_features is l*r (row-major)
    // buffer * T(weights) + bias -> output_features
    (*gemm)('n', 'n',
            output_nPlanes,                        // r
            output_nActive,                        // l
            input_nPlanes * filterVolume,          // m
            1,                                     // alpha
            weight, output_nPlanes,                // r
            buffer, input_nPlanes * filterVolume,  // m
            1,                                     // beta
            output_features, output_nPlanes        // r
            );
  } else {
    (*gemm)('n', 'n',
            output_nPlanes,                        // r
            output_nActive,                        // l
            input_nPlanes * filterVolume,          // m
            1,                                     // alpha
            weight, output_nPlanes,                // r
            buffer, input_nPlanes * filterVolume,  // m
            0,                                     // beta
            output_features, output_nPlanes        // r
            );
  }
}

template <typename T>
void NetworkInNetwork_BackwardPass(
    T *d_input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *weight, uInt *rules, uInt filterVolume,
    uInt output_nActive, T *d_buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is m*r (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * weight -> d_buffer
  (*gemm)('t', 'n',
          input_nPlanes * filterVolume,            // r
          output_nActive,                          // l
          output_nPlanes,                          // m
          1,                                       // alpha
          weight, output_nPlanes,                  // m
          d_output_features, output_nPlanes,       // m
          0,                                       // beta
          d_buffer, input_nPlanes * filterVolume   // r
          );
  // Use rules and d_buffer to accumulate gradient information into d_input
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    auto r = rules[row];
    if (r != uInt_MAX) // 2^32-1
      for (uInt i = 0; i < input_nPlanes; i++)
        d_input_features[r * input_nPlanes + i] +=
            d_buffer[row * input_nPlanes + i];
  }
}

template <typename T>
void NetworkInNetwork_GradWeights(
    T *input_features, uInt input_nPlanes, T *d_output_features,
    uInt output_nPlanes, T *d_weight, T *d_bias, uInt *rules,
    uInt filterVolume, uInt output_nActive, T *buffer,
    void (*gemm)(char transa, char transb, long m, long n, long k, T alpha,
                 T *a, long lda, T *b, long ldb, T beta, T *c, long ldc)) {
  // d_weight
  // Use input_features and rules to fill buffer
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX) { // 2^32-1
      std::memset(buffer + row * input_nPlanes, 0, sizeof(T) * input_nPlanes);
    } else {
      std::memcpy(buffer + row * input_nPlanes,
                  input_features + rules[row] * input_nPlanes,
                  sizeof(T) * input_nPlanes);
    }
  }
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is m*l (row-major)
  // buffer is m*r (row-major)
  // weights is l*r (row-major)
  // T(d_output_features) * buffer -> d_weight
  (*gemm)('n', 't',
          output_nPlanes,                          // r
          input_nPlanes * filterVolume,            // l
          output_nActive,                          // m
          1,                                       // alpha
          d_output_features, output_nPlanes,       // r
          buffer, input_nPlanes * filterVolume,    // l
          1,                                       // beta
          d_weight, output_nPlanes                 // r
          );
  if (d_bias)
    for (uInt row = 0; row < output_nActive; row++)
      for (uInt i = 0; i < output_nPlanes; i++)
        d_bias[i] += d_output_features[row * output_nPlanes + i];
}
#endif /* CPU_NetworkInNetwork_H */
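
The backward and gradient functions above share one convention worth spelling out: rules is a flat array of output_nActive * filterVolume slots, each holding either an input row index or uInt_MAX (2^32 - 1) for a pool position with no active input. GradWeights turns that into a dense buffer by copying rows and zero-filling the holes before its GEMM. A standalone sketch of just that gather step (toy sizes are assumptions):

#include <cstdint>
#include <cstdio>
#include <cstring>

typedef unsigned int uInt;
const uInt uInt_MAX = UINT32_MAX; // 2^32 - 1 marks an inactive slot

int main() {
  const uInt input_nPlanes = 2, output_nActive = 2, filterVolume = 2;
  float input[3 * input_nPlanes] = {1, 2, 3, 4, 5, 6}; // 3 active input rows
  uInt rules[output_nActive * filterVolume] = {0, uInt_MAX, 2, 1};
  float buffer[output_nActive * filterVolume * input_nPlanes];
  for (uInt row = 0; row < output_nActive * filterVolume; row++) {
    if (rules[row] == uInt_MAX)
      std::memset(buffer + row * input_nPlanes, 0,
                  sizeof(float) * input_nPlanes); // hole: zero-filled row
    else
      std::memcpy(buffer + row * input_nPlanes,
                  input + rules[row] * input_nPlanes,
                  sizeof(float) * input_nPlanes); // gather input row
  }
  for (uInt row = 0; row < output_nActive * filterVolume; row++)
    printf("%g %g\n", buffer[row * 2], buffer[row * 2 + 1]);
  // prints: 1 2 / 0 0 / 5 6 / 3 4
  return 0;
}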

PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.cpp  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/SparseToDense.cpp"
#else
#include "SparseToDense.h"

extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *output_features, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    long sz[Dimension + 2];
    sz[0] = _m.inputSGs->size();
    sz[1] = input_features->size[1];
    for (int i = 0; i < Dimension; i++) {
      auto x = THLongTensor_data(inputSize)[i];
      sz[i + 2] = x;
    }
    THTensor_(resizeNd)(output_features, Dimension + 2, sz, NULL);
    THTensor_(zero)(output_features);
  }
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = input_features->size[1];
  auto iF = THTensor_(data)(input_features);
  auto oF = THTensor_(data)(output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    SparseToDense_ForwardPass<real>(iF, oF, nPlanes, spatialVolume, &r[0],
                                    nHot);
    oF += spatialVolume;
  }
}

extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THTensor *input_features,
    THTensor *d_input_features, THTensor *d_output_features,
    void *rulesBuffer) {
  THTensor_(resizeAs)(d_input_features, input_features);
  THTensor_(zero)(d_input_features);
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = d_input_features->size[1];
  auto diF = THTensor_(data)(d_input_features);
  auto doF = THTensor_(data)(d_output_features);
  for (auto &r : _rules) {
    uInt nHot = r.size() / 2;
    SparseToDense_BackwardPass<real>(diF, doF, nPlanes, spatialVolume, &r[0],
                                     nHot);
    doF += spatialVolume;
  }
}
#endif

PyTorch/sparseconvnet/SCN/generic/CPU/SparseToDense.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
#include "../SparseConvNet.h"

template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
                               uInt nPlanes, uInt spatialVolume, uInt *rules,
                               int nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    T *i = &input_features[rules[2 * outSite] * nPlanes];
    uInt sample = rules[2 * outSite + 1];
    for (uInt plane = 0; plane < nPlanes; plane++)
      output_features[(sample * nPlanes + plane) * spatialVolume] = i[plane];
  }
}

template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
                                uInt nPlanes, uInt spatialVolume, uInt *rules,
                                int nHot) {
  for (uInt outSite = 0; outSite < nHot; outSite++) {
    T *di = &d_input_features[rules[2 * outSite] * nPlanes];
    uInt sample = rules[2 * outSite + 1];
    for (uInt plane = 0; plane < nPlanes; plane++)
      di[plane] = d_output_features[(sample * nPlanes + plane) * spatialVolume];
  }
}
#endif /* CPU_SPARSETODENSE_H */
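
The addressing here follows from the dense output layout (batch, nPlanes, spatial...) in row-major order: element (sample, plane, spatialOffset) lives at flat index (sample * nPlanes + plane) * spatialVolume + spatialOffset, and the callers in SparseToDense.cpp supply the spatialOffset term by advancing the base pointer with oF += spatialVolume, one rule list per spatial location. A small index-arithmetic check (toy sizes assumed):

#include <cstdio>

int main() {
  const unsigned batch = 2, nPlanes = 3, spatialVolume = 4; // e.g. a 2x2 grid
  float dense[batch * nPlanes * spatialVolume] = {0};

  unsigned sample = 1, plane = 2, spatialOffset = 3;
  // What the "oF += spatialVolume" loop builds: a base pointer already
  // carrying the spatial offset, then the kernels' (sample, plane) stride.
  float *base = dense + spatialOffset;
  base[(sample * nPlanes + plane) * spatialVolume] = 7.0f;

  // Same element via the full row-major formula:
  unsigned flat = (sample * nPlanes + plane) * spatialVolume + spatialOffset;
  printf("%g\n", dense[flat]); // prints: 7
  return 0;
}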

PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateDimFloatTypes.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimFloatTypes.h"
#endif
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
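
These two headers implement the TH "generic file" pattern: the same source body is re-included once per (Dimension, real) binding, stamping out one concrete function per combination, and macros such as scn_DR_ presumably splice those bindings into the exported symbol names. A self-contained miniature of the idea (hypothetical names throughout; the real mechanism re-includes a file rather than expanding a function-defining macro):

#include <cstdio>

#define CONCAT_(a, b, c) a##_##b##_##c
#define CONCAT(a, b, c) CONCAT_(a, b, c)

// The "generic body": in the real pattern this lives in its own file and is
// re-#include'd with real/Dimension rebound between inclusions.
#define DEFINE_VOLUME(real, Dimension)                                       \
  real CONCAT(volume, real, Dimension)(const real *sides) {                  \
    real v = 1;                                                              \
    for (int i = 0; i < Dimension; i++) v *= sides[i];                       \
    return v;                                                                \
  }

// The THGenerateDimFloatTypes.h role: expand every combination.
DEFINE_VOLUME(float, 2)
DEFINE_VOLUME(double, 3)

int main() {
  float s2[2] = {3, 4};
  double s3[3] = {2, 3, 4};
  printf("%g %g\n", volume_float_2(s2), volume_double_3(s3)); // 12 24
  return 0;
}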

PyTorch/sparseconvnet/SCN/generic/CPU/THGenerateFloatTypes.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h"
#endif
#define real float
#define accreal double
#define Real Float
#define TH_REAL_IS_FLOAT
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_FLOAT
#define real double
#define accreal double
#define Real Double
#define TH_REAL_IS_DOUBLE
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef TH_REAL_IS_DOUBLE
#undef TH_GENERIC_FILE

PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/ActivePooling.cu"
#else
#include "ActivePooling.h"

extern "C" void scn_DR_(ActivePooling_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, THCITensor *rulesBuffer, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resize2d)(state, output_features, batchSize, nPlanes);
  THCTensor_(zero)(state, output_features);
  if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
    THCITensor_resize1d(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_ForwardPass<real>(iF, oF + o * nPlanes, batchSize_,
                                    maxActive, nPlanes, rb, average);
  }
}

extern "C" void scn_DR_(ActivePooling_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCITensor *rulesBuffer, bool average) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1];
  auto _rules = _m.getActivePoolingRuleBook(inputSize);
  uInt batchSize = _rules[1][0];
  uInt maxActive = _rules[1][1];
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  if (THCITensor_nElement(state, rulesBuffer) < 1 << 22)
    THCITensor_resize1d(state, rulesBuffer, 1 << 22);
  uInt *rb = (uInt *)THCITensor_data(state, rulesBuffer);
  uInt rowBatchSize = std::min((uInt)32768, (1 << 22) / (maxActive + 1));
  THAssert(rowBatchSize > 0);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  for (uInt o = 0; o < batchSize; o += rowBatchSize) {
    uInt batchSize_ = std::min(rowBatchSize, (uInt)(batchSize - o));
    cudaMemcpy(rb, &_rules[0][o * (maxActive + 1)],
               sizeof(uInt) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_BackwardPass<real>(diF, doF + o * nPlanes, batchSize_,
                                     maxActive, nPlanes, rb, average);
  }
}
#endif

PyTorch/sparseconvnet/SCN/generic/GPU/ActivePooling.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef GPU_ACTIVEPOOLING_H
#define GPU_ACTIVEPOOLING_H

template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               uInt batchSize, uInt maxActive, uInt nPlanes,
                               uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_fp<T><<<batchSize, kernelBlockDim, 0,
                        THCState_getCurrentStream(state)>>>(
      input_features, output_features, maxActive, nPlanes, rules, average);
}

template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 uInt maxActive, uInt nPlanes, uInt *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  uInt *r = &rules[blockIdx.x * (maxActive + 1)];
  uInt nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (uInt plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}

template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                uInt batchSize, uInt maxActive, uInt nPlanes,
                                uInt *rules, bool average) {
  uInt kernelBlockDim = std::min(nPlanes, (uInt)32);
  ActivePooling_bp<T><<<batchSize, kernelBlockDim, 0,
                        THCState_getCurrentStream(state)>>>(
      d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
#endif /* GPU_ActivePOOLING_H */
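
The rules layout used by ActivePooling differs from the (input, output) pair format of the other pooling layers: each batch item owns maxActive + 1 slots, a count followed by up to maxActive input row indices, and *r++ pops the count. A CPU sketch of the forward pass under that layout (toy data assumed; the real kernel parallelizes over batch items and planes):

#include <cstdio>

typedef unsigned int uInt;

int main() {
  const uInt nPlanes = 2, maxActive = 3, batchSize = 2;
  float in[4 * nPlanes] = {1, 2, 3, 4, 5, 6, 7, 8}; // 4 active sites in total
  float out[batchSize * nPlanes] = {0};
  uInt rules[batchSize * (maxActive + 1)] = {3, 0, 1, 2,  // item 0: 3 sites
                                             1, 3, 0, 0}; // item 1: 1 site
  for (uInt b = 0; b < batchSize; b++) {
    const uInt *r = &rules[b * (maxActive + 1)];
    uInt nActive = *r++; // first slot is the count
    float mult = nActive > 0 ? 1.0f / nActive : 1.0f; // average=true case
    while (nActive-- > 0) {
      const float *inp = &in[(*r++) * nPlanes];
      for (uInt p = 0; p < nPlanes; p++) out[b * nPlanes + p] += inp[p] * mult;
    }
  }
  printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]); // 3 4 / 7 8
  return 0;
}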

PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/AffineReluTrivialConvolution.cu"
#else
#include "AffineReluTrivialConvolution.h"
#include <algorithm>
#include <iostream>

extern "C" void scn_R_(AffineReluTrivialConvolution_updateOutput)(
    THCTensor *input_features, THCTensor *output_features,
    THCTensor *affineWeight, THCTensor *affineBias, THCTensor *convWeight) {
  THCTensor_(resize2d)(state, output_features, input_features->size[0],
                       convWeight->size[1]);
  dAffineReluTrivialConvolution_forward<real>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features),
      THCTensor_(data)(state, affineWeight),
      THCTensor_(data)(state, affineBias),
      THCTensor_(data)(state, convWeight), convWeight->size[0],
      input_features->stride[0], convWeight->size[1],
      output_features->size[1], input_features->size[0]);
}

extern "C" void scn_R_(AffineReluTrivialConvolution_backward)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *affineWeight,
    THCTensor *d_affineWeight, THCTensor *affineBias, THCTensor *d_affineBias,
    THCTensor *convWeight, THCTensor *d_convWeight, bool additiveGrad) {
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  dAffineReluTrivialConvolution_backward_dW<real>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features),
      THCTensor_(data)(state, affineWeight),
      THCTensor_(data)(state, d_affineWeight),
      THCTensor_(data)(state, affineBias),
      THCTensor_(data)(state, d_affineBias),
      THCTensor_(data)(state, convWeight),
      THCTensor_(data)(state, d_convWeight), convWeight->size[0],
      input_features->stride[0], convWeight->size[1],
      d_output_features->stride[0], input_features->size[0], additiveGrad);
}
#endif

PyTorch/sparseconvnet/SCN/generic/GPU/AffineReluTrivialConvolution.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef GPU_AFFINERELUTRIVIALCONVOLUTION_H
#define GPU_AFFINERELUTRIVIALCONVOLUTION_H

// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)

template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardA(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  // nActive must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i > 0) ? i : 0;
        if (m == 0) {
          O[v] = 0;
        } else {
          O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_forwardB(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  convWeight += n * K;
  T O[V];
  __shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read affineWeight, affineBias and convWeight
    if (ty[0] == 0) {
      AW[tx] = affineWeight[tx];
      AB[tx] = affineBias[tx];
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
      // Read input, do affine + relu, set O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nActive) {
          T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i > 0) ? i : 0;
          if (m == 0) {
            O[v] = 0;
          } else {
            O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
          }
        }
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive)
          outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
      __syncthreads();
    }
    affineWeight += K;
    affineBias += K;
    convWeight += K * output_nPlanes;
    inFeatures += K;
  }
}

template <typename T>
void dAffineReluTrivialConvolution_forward(
    T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
    T *convWeight, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride, uInt nActive) {
  {
    const uInt K = 64;
    const uInt V = 16;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  {
    const uInt K = 32;
    const uInt V = 4;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  {
    const uInt K = 16;
    const uInt V = 4;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  {
    const uInt K = 8;
    const uInt V = 2;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_forwardA<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), output_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, outFeatures, affineWeight, affineBias, convWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride, o);
      if (nActive > o)
        dAffineReluTrivialConvolution_forwardB<T, K, V>
            <<<dim3(1, output_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, outFeatures + o * output_stride,
                affineWeight, affineBias, convWeight, input_nPlanes,
                input_stride, output_nPlanes, output_stride, nActive - o);
      return;
    }
  }
  assert(false);
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
        i[v] = i_;
        i_ = i_ * AW[tx] + AB[tx];
        I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
        dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++) {
        dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
        dAW += i[v] * dI[v];
        dAB += dI[v];
        if (additiveGrad)
          dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
        else
          dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
      }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive, bool additiveGrad) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  convWeight += m * K * output_nPlanes;
  dConvWeight += m * K * output_nPlanes;
  affineWeight += m * K;
  dAffineWeight += m * K;
  affineBias += m * K;
  dAffineBias += m * K;
  T dI[V];
  T dCW[V];
  T i[V];
  T dAW = 0;
  T dAB = 0;
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T AW[K];
  __shared__ T AB[K];
  __shared__ T CW[K][K];
  const uInt tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  if (ty[0] == 0) {
    AW[tx] = affineWeight[tx];
    AB[tx] = affineBias[tx];
  }
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
      dCW[v] = 0;
    }
    __syncthreads();
    for (uInt s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] = 0;
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
          i[v] = i_;
          i_ = i_ * AW[tx] + AB[tx];
          I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
          dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
        } else {
          i[v] = 0;
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * CW[tx][k];
          dCW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nActive) {
          dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
          dAW += i[v] * dI[v];
          dAB += dI[v];
          if (additiveGrad)
            dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
          else
            dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
        }
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
    convWeight += K;
    dConvWeight += K;
    dOutFeatures += K;
    __syncthreads();
  }
  atomicAdd(&dAffineWeight[tx], dAW);
  atomicAdd(&dAffineBias[tx], dAB);
}

template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, uInt input_nPlanes, uInt input_stride,
    uInt output_nPlanes, uInt output_stride, uInt nActive, bool additiveGrad) {
  {
    const uInt K = 32;
    const uInt V = 8;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, dInFeatures, dOutFeatures, affineWeight,
                dAffineWeight, affineBias, dAffineBias, convWeight,
                dConvWeight, input_nPlanes, input_stride, output_nPlanes,
                output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<T, K, V>
            <<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, dInFeatures + o * input_stride,
                dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
                affineBias, dAffineBias, convWeight, dConvWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride,
                nActive - o, additiveGrad);
      return;
    }
  }
  {
    const uInt K = 16;
    const uInt V = 4;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, dInFeatures, dOutFeatures, affineWeight,
                dAffineWeight, affineBias, dAffineBias, convWeight,
                dConvWeight, input_nPlanes, input_stride, output_nPlanes,
                output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<T, K, V>
            <<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, dInFeatures + o * input_stride,
                dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
                affineBias, dAffineBias, convWeight, dConvWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride,
                nActive - o, additiveGrad);
      return;
    }
  }
  {
    const uInt K = 8;
    const uInt V = 2;
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {
      uInt o = (nActive / K) * K;
      if (o > 0)
        dAffineReluTrivialConvolution_backward_dW_A<T, K, V>
            <<<dim3(std::min(o / K, (uInt)512), input_nPlanes / K),
               dim3(K, K / V), 0, THCState_getCurrentStream(state)>>>(
                inFeatures, dInFeatures, dOutFeatures, affineWeight,
                dAffineWeight, affineBias, dAffineBias, convWeight,
                dConvWeight, input_nPlanes, input_stride, output_nPlanes,
                output_stride, o, additiveGrad);
      if (nActive > o)
        dAffineReluTrivialConvolution_backward_dW_B<T, K, V>
            <<<dim3(1, input_nPlanes / K), dim3(K, K / V), 0,
               THCState_getCurrentStream(state)>>>(
                inFeatures + o * input_stride, dInFeatures + o * input_stride,
                dOutFeatures + o * output_stride, affineWeight, dAffineWeight,
                affineBias, dAffineBias, convWeight, dConvWeight,
                input_nPlanes, input_stride, output_nPlanes, output_stride,
                nActive - o, additiveGrad);
      return;
    }
  }
}
#endif
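
Stripped of the tiling, what the kernels in this header compute is relu(input * affineWeight + affineBias), applied per input plane, fed into a 1x1 ("trivial") convolution with convWeight, using K x K shared-memory tiles and V register-resident partial outputs per thread (the Volkov-style blocking the comments mention). A scalar reference sketch of just the semantics (toy sizes assumed; this is not the library API):

#include <algorithm>
#include <cstdio>

typedef unsigned int uInt;

// Hypothetical reference: out[s][o] = sum_i relu(in[s][i]*aw[i]+ab[i]) * cw[i][o]
void reference_forward(const float *in, float *out, const float *aw,
                       const float *ab, const float *cw, uInt inPlanes,
                       uInt outPlanes, uInt nActive) {
  for (uInt s = 0; s < nActive; s++)
    for (uInt o = 0; o < outPlanes; o++) {
      float acc = 0;
      for (uInt i = 0; i < inPlanes; i++) {
        float a = in[s * inPlanes + i] * aw[i] + ab[i];   // per-plane affine
        acc += std::max(a, 0.0f) * cw[i * outPlanes + o]; // relu + 1x1 conv
      }
      out[s * outPlanes + o] = acc;
    }
}

int main() {
  float in[1 * 2] = {1, -1}, aw[2] = {2, 2}, ab[2] = {0, 1};
  float cw[2 * 1] = {1, 1}, out[1];
  reference_forward(in, out, aw, ab, cw, 2, 1, 1);
  // relu(1*2+0) = 2, relu(-1*2+1) = 0, so out = 2*1 + 0*1 = 2
  printf("%g\n", out[0]); // prints: 2
  return 0;
}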

PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/AveragePooling.cu"
#else
#include "AveragePooling.h"
#include "RuleBookIterator.h"

extern "C" void scn_DR_(AveragePooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive,
                       input_features->size[1] - nFeaturesToDrop);
  THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  RULEBOOKITERATOR(AveragePooling_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       input_features->size[1], output_features->size[1], rbB,
                       nHotB, _rules.size());
                   , )
}

extern "C" void scn_DR_(AveragePooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    long nFeaturesToDrop, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(AveragePooling_BackwardPass<real>(
                       THCState_getCurrentStream(state), diF, doF, nPlanes,
                       input_features->size[1], d_output_features->size[1],
                       rbB, nHotB, _rules.size());
                   , )
}
#endif

PyTorch/sparseconvnet/SCN/generic/GPU/AveragePooling.h  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef GPU_AVERAGEPOOLING_H
#define GPU_AVERAGEPOOLING_H

// NTX must be >=2 so r is filled properly
template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}

template <typename T>
void AveragePooling_ForwardPass(cudaStream_t stream, T *input_features,
                                T *output_features, uInt nPlanes,
                                uInt input_stride, uInt output_stride,
                                uInt *rules, uInt nHot, uInt filterVolume) {
  AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot, 1.0 / filterVolume);
}

template <typename T, uInt NTX, uInt NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
                                  uInt nPlanes, uInt input_stride,
                                  uInt output_stride, uInt *rules, uInt nHot,
                                  T alpha) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;
      uInt o = r[2 * threadIdx.y + 1] * output_stride;
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        d_input_features[i + plane] += alpha * d_output_features[o + plane];
    }
    __syncthreads();
  }
}

template <typename T>
void AveragePooling_BackwardPass(cudaStream_t stream, T *d_input_features,
                                 T *d_output_features, uInt nPlanes,
                                 uInt input_stride, uInt output_stride,
                                 uInt *rules, uInt nHot, uInt filterVolume) {
  AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      d_input_features, d_output_features, nPlanes, input_stride,
      output_stride, rules, nHot, 1.0 / filterVolume);
}
#endif /* GPU_AVERAGEPOOLING_H */
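
One behavioral detail worth noting: the wrappers above pass alpha = 1.0 / filterVolume, so each (input, output) rule pair contributes input/filterVolume and the "average" is taken over the full pool volume, not over the number of active sites that actually fall inside the pool. A toy sketch of the resulting scaling (assumed data):

#include <cstdio>

typedef unsigned int uInt;

int main() {
  const uInt nPlanes = 1, filterVolume = 4; // e.g. 2x2 pooling
  float in[2] = {8, 4};                     // only 2 of the 4 pool sites active
  float out[1] = {0};                       // 1 output site
  uInt rules[] = {0, 0, 1, 0};              // both inputs map to output site 0
  float alpha = 1.0f / filterVolume;
  for (uInt n = 0; n < 2; n++)
    for (uInt p = 0; p < nPlanes; p++)
      out[rules[2 * n + 1] * nPlanes + p] +=
          alpha * in[rules[2 * n] * nPlanes + p];
  printf("%g\n", out[0]); // (8+4)/4 = 3, not (8+4)/2 = 6
  return 0;
}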

PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.cu  0 → 100644

// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchNormalization.cu"
#else
#include "BatchNormalization.h"

#define BN_F_MACRO(N) \
  if (nPlanes % N == 0) { \
    BatchNormalization_ForwardPass<real, N, 64>( \
        THCTensor_(data)(state, input_features), \
        THCTensor_(data)(state, output_features), nPlanes, input_stride, \
        output_stride, nActive, THCTensor_(data)(state, saveMean), \
        THCTensor_(data)(state, saveInvStd), \
        THCTensor_(data)(state, runningMean), \
        THCTensor_(data)(state, runningVar), \
        weight ? THCTensor_(data)(state, weight) : 0, \
        bias ? THCTensor_(data)(state, bias) : 0, eps, momentum, train, \
        leakiness); \
  }

extern "C" void scn_R_(BatchNormalization_updateOutput)(
    THCTensor *input_features, THCTensor *output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias, real eps,
    real momentum, bool train, real leakiness) {
  THCTensor_(resizeAs)(state, output_features, input_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}

extern "C" void scn_R_(BatchNormalizationInTensor_updateOutput)(
    THCTensor *input_features, THCTensor *output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias, real eps,
    real momentum, bool train, real leakiness) {
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
#undef BN_F_MACRO

#define BN_B_MACRO(N) \
  if (nPlanes % N == 0) { \
    BatchNormalization_BackwardPass<real, N, 64>( \
        THCTensor_(data)(state, input_features), \
        THCTensor_(data)(state, d_input_features), \
        THCTensor_(data)(state, output_features), \
        THCTensor_(data)(state, d_output_features), nPlanes, input_stride, \
        output_stride, nActive, THCTensor_(data)(state, saveMean), \
        THCTensor_(data)(state, saveInvStd), \
        THCTensor_(data)(state, runningMean), \
        THCTensor_(data)(state, runningVar), \
        weight ? THCTensor_(data)(state, weight) : 0, \
        bias ? THCTensor_(data)(state, bias) : 0, \
        d_weight ? THCTensor_(data)(state, d_weight) : 0, \
        d_bias ? THCTensor_(data)(state, d_bias) : 0, leakiness); \
  }

extern "C" void scn_R_(BatchNormalization_backward)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *output_features, THCTensor *d_output_features,
    THCTensor *saveMean, THCTensor *saveInvStd, THCTensor *runningMean,
    THCTensor *runningVar, THCTensor *weight, THCTensor *bias,
    THCTensor *d_weight, THCTensor *d_bias, real leakiness) {
  THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  auto input_stride = input_features->stride[0];
  auto output_stride = output_features->stride[0];
  BN_B_MACRO(16)
  else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
#endif
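
BN_F_MACRO and BN_B_MACRO lean on a small trick: each expansion is a bare if (nPlanes % N == 0) { ... } block, so the macro calls can be chained with else into a single if/else-if ladder that picks the largest usable NTX first. A miniature with hypothetical names:

#include <cstdio>

// Each expansion is an if-block, so "PICK(16) else PICK(12) ..." chains.
#define PICK(N) if (nPlanes % N == 0) { best = N; }

int main() {
  int nPlanes = 24, best = 0;
  PICK(16) else PICK(12) else PICK(8) else PICK(4) else PICK(1)
  printf("%d\n", best); // 24 % 16 != 0 but 24 % 12 == 0, prints: 12
  return 0;
}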
PyTorch/sparseconvnet/SCN/generic/GPU/BatchNormalization.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHNORMALIZATION_H
#define GPU_BATCHNORMALIZATION_H
#include "../SparseConvNet.h"
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
// values to act on a subset of columns, i.e. an inplace DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_f_train(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, c = plane + threadIdx.y * input_stride;
         row < nActive; row += NTY, c += input_stride * NTY) {
      T i = input_features[c];
      t[threadIdx.y][threadIdx.x] += i;
      t2[threadIdx.y][threadIdx.x] += i * i;
    }
    __syncthreads();
    T _saveMean = 0;
    T _saveInvStd = 0;
    for (uInt row = 0; row < NTY; row++) {
      _saveMean += t[row][threadIdx.x];
      _saveInvStd += t2[row][threadIdx.x];
    }
    _saveMean /= nActive;
    _saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
    if (threadIdx.y == 0) {
      saveMean[plane] = _saveMean;
      runningMean[plane] =
          momentum * runningMean[plane] + (1 - momentum) * _saveMean;
      runningVar[plane] = momentum * runningVar[plane] +
                          (1 - momentum) * _saveInvStd / (nActive - 1);
    }
    _saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
    if (threadIdx.y == 0)
      saveInvStd[plane] = _saveInvStd;
    __syncthreads();
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
      t[1][threadIdx.x] =
          -_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
    }
    __syncthreads();
    T W = t[0][threadIdx.x];
    T B = t[1][threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = W * input_features[ci] + B;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
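For reference, the statistics the kernel gathers reduce to the standard sum / sum-of-squares identities: it accumulates sum(x) and sum(x*x) per plane, derives mean = sum/n and invStd = (SS/n + eps)^(-1/2) with SS = sum(x*x) - n*mean^2, and folds SS/(n-1) into runningVar. A scalar sketch of the same arithmetic (plain C++; the function name is chosen here for illustration):

#include <cmath>
#include <cstddef>

// Scalar sketch of the per-plane statistics in BatchNormalization_f_train.
void bnTrainStats(const float *x, std::size_t n, float eps, float &mean,
                  float &invStd, float &unbiasedVar) {
  float sum = 0, sum2 = 0;
  for (std::size_t i = 0; i < n; i++) {
    sum += x[i];
    sum2 += x[i] * x[i];
  }
  mean = sum / n;
  float ss = sum2 - mean * mean * n; // sum of squared deviations
  invStd = std::pow(ss / n + eps, -0.5f);
  unbiasedVar = ss / (n - 1); // what gets folded into runningVar
}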
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_f_test(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    T leakiness) {
  __shared__ T W[NTX];
  __shared__ T B[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      W[threadIdx.x] =
          pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
      B[threadIdx.x] =
          (bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
    }
    __syncthreads();
    float w = W[threadIdx.x], b = B[threadIdx.x];
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T out = w * input_features[ci] + b;
      output_features[co] = (out > 0) ? out : (out * leakiness);
    }
    __syncthreads();
  }
}
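In test mode the whole normalization collapses into a single affine map per plane, w = (runningVar + eps)^(-1/2) * gamma and b = beta - runningMean * w, which is exactly what the kernel caches in W[] and B[]. A scalar sketch of one element (illustrative name; leakiness fuses a leaky ReLU into the same pass):

#include <cmath>

// Scalar sketch of the per-plane affine fold used by BatchNormalization_f_test.
float bnInferOne(float x, float runningMean, float runningVar, float weight,
                 float bias, float eps, float leakiness) {
  float w = std::pow(runningVar + eps, -0.5f) * weight;
  float b = bias - runningMean * w;
  float out = w * x + b;
  return out > 0 ? out : out * leakiness;
}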
template <typename T, uInt NTX, uInt NTY>
void BatchNormalization_ForwardPass(
    T *input_features, T *output_features, uInt nPlanes, uInt input_stride,
    uInt output_stride, uInt nActive, T *saveMean, T *saveInvStd,
    T *runningMean, T *runningVar, T *weight, T *bias, T eps, T momentum,
    bool train, T leakiness) {
  if (train) {
    BatchNormalization_f_train<T, NTX, NTY><<<
        std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
        THCState_getCurrentStream(state)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
  } else {
    BatchNormalization_f_test<T, NTX, NTY><<<
        std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
        THCState_getCurrentStream(state)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
  }
}
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchNormalization_b(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, uInt nPlanes, uInt input_stride, uInt output_stride,
    uInt nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  __shared__ T t[NTY][NTX];
  __shared__ T t2[NTY][NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0) {
      t[0][threadIdx.x] = saveMean[plane];
      t[1][threadIdx.x] = saveInvStd[plane];
      t[2][threadIdx.x] = (weight ? weight[plane] : 1);
    }
    __syncthreads();
    T _saveMean = t[0][threadIdx.x];
    T _saveInvStd = t[1][threadIdx.x];
    T _weight = t[2][threadIdx.x];
    __syncthreads();
    t[threadIdx.y][threadIdx.x] = 0;
    t2[threadIdx.y][threadIdx.x] = 0;
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      T d = d_output_features[co];
      d = (output_features[co] > 0) ? d : (d * leakiness);
      d_output_features[co] = d;
      t[threadIdx.y][threadIdx.x] += d;
      t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
    }
    __syncthreads();
    T gradMean = 0;
    T dotp = 0;
    for (int row = 0; row < NTY; row++) {
      gradMean += t[row][threadIdx.x];
      dotp += t2[row][threadIdx.x];
    }
    __syncthreads();
    if (d_weight)
      d_weight[plane] = dotp * _saveInvStd;
    if (d_bias)
      d_bias[plane] = gradMean; // sum really
    gradMean /= nActive;
    T k = dotp * _saveInvStd * _saveInvStd / nActive;
    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
      d_input_features[ci] = (d_output_features[co] - gradMean -
                              (input_features[ci] - _saveMean) * k) *
                             _saveInvStd * _weight;
    }
    __syncthreads();
  }
}
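Written out, with mu = saveMean, s = saveInvStd, gamma = weight, and g_j the post-leaky-ReLU output gradient, the input gradient computed in the final loop is the standard batch-norm backward expression (gradMean and dotp are the two shared-memory reductions above, so this matches the kernel's arithmetic term for term):

\frac{\partial L}{\partial x_i} = \gamma\, s \left( g_i \;-\; \frac{1}{n}\sum_j g_j \;-\; (x_i-\mu)\,\frac{s^2}{n}\sum_j (x_j-\mu)\, g_j \right)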
template <typename T, uInt NTX, uInt NTY>
void BatchNormalization_BackwardPass(
    T *input_features, T *d_input_features, T *output_features,
    T *d_output_features, uInt nPlanes, uInt input_stride, uInt output_stride,
    uInt nActive, T *saveMean, T *saveInvStd, T *runningMean, T *runningVar,
    T *weight, T *bias, T *d_weight, T *d_bias, T leakiness) {
  BatchNormalization_b<T, NTX, NTY><<<
      std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
      THCState_getCurrentStream(state)>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
      runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
}
#undef NTX
#undef NTY
#endif /* GPU_BATCHNORMALIZATION_H */
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.cu
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
#else
#include "BatchwiseMultiplicativeDropout.h"
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << < \
dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
THCState_getCurrentStream(state)>>> \
(THCTensor_(data)(state, input_features), \
THCTensor_(data)(state, output_features), \
THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
    THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
    float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
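The kernels dispatched above apply one multiplicative noise value per feature plane and fuse a leaky ReLU with slope alpha into the same pass: out = in * noise[plane] * (in > 0 ? 1 : alpha). A dense CPU sketch of the forward computation (illustrative name; assumes contiguous nActive x nPlanes features):

#include <cstddef>

// Scalar reference for BatchwiseMultiplicativeDropout_fp: one noise value per
// plane scales every active row, with leaky-ReLU slope alpha for x <= 0.
void bmdForward(const float *in, float *out, const float *noise,
                std::size_t nActive, std::size_t nPlanes, float alpha) {
  for (std::size_t row = 0; row < nActive; row++)
    for (std::size_t p = 0; p < nPlanes; p++) {
      float x = in[row * nPlanes + p];
      out[row * nPlanes + p] = x * noise[p] * (x > 0 ? 1.0f : alpha);
    }
}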
#undef SPARSECONVNET_FOO
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << < \
dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
THCState_getCurrentStream(state)>>> \
(THCTensor_(data)(state, input_features), \
THCTensor_(data)(state, d_input_features), \
THCTensor_(data)(state, d_output_features), \
THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *noise, float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto nActive = input_features->size[0];
  auto nPlanes = input_features->size[1];
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
  SPARSECONVNET_FOO(12, 64)
  SPARSECONVNET_FOO(8, 64)
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(
    T *input_features, T *output_features, T *noise, uInt nActive,
    uInt nPlanes, uInt input_stride, uInt output_stride, T alpha) {
  __shared__ T nz[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      uInt i = row * input_stride + plane;
      uInt o = row * output_stride + plane;
      output_features[o] = input_features[i] * nz[threadIdx.x] *
                           ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}
template <typename T, uInt NTX, uInt NTY>
__global__ void BatchwiseMultiplicativeDropout_bp(
    T *input_features, T *d_input_features, T *d_output_features, T *noise,
    uInt nActive, uInt nPlanes, uInt input_stride, uInt output_stride,
    T alpha) {
  __shared__ T nz[NTX];
  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
       plane += gridDim.x * NTX) {
    if (threadIdx.y == 0)
      nz[threadIdx.x] = noise[plane];
    __syncthreads();
    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
      uInt i = row * input_stride + plane;
      uInt o = row * output_stride + plane;
      d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
    }
    __syncthreads();
  }
}
#endif /* GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H */
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Convolution.cu"
#else
#include "Convolution.h"
#include "RuleBookIterator.h"
#include <algorithm>
#include <cstring>
extern "C" double scn_DR_(Convolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize,
    THLongTensor *filterSize, THLongTensor *filterStride, void **m,
    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_forward2<real>(
                       iF, oF, w, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; flops += nHotB * c;)
  return flops;
}
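The RULEBOOKITERATOR macro walks one list of (inputRow, outputRow) pairs per filter offset, applying the corresponding ip x op slice of the weight tensor to each pair and advancing w (and the flop count) between slices. A dense CPU reference for a single slice (the names and signature here are illustrative, not the library's API; the GPU path uses dConvolution_forward2):

#include <cstddef>
#include <vector>

// CPU sketch of one rulebook slice: for each rule pair (inRow, outRow),
// accumulate inFeatures[inRow] * W into outFeatures[outRow].
void applyRuleSlice(const std::vector<float> &inF, std::vector<float> &outF,
                    const std::vector<float> &w,       // ip x op, row-major
                    const std::vector<unsigned> &rules, // 2 * nHot entries
                    std::size_t ip, std::size_t op) {
  for (std::size_t r = 0; r + 1 < rules.size(); r += 2) {
    const float *in = &inF[rules[r] * ip];
    float *out = &outF[rules[r + 1] * op];
    for (std::size_t j = 0; j < op; j++)
      for (std::size_t i = 0; i < ip; i++)
        out[j] += in[i] * w[i * op + j];
  }
}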
extern "C" void scn_DR_(Convolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize,
    THLongTensor *filterSize, THLongTensor *filterStride, void **m,
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
    THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
                       iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
extern "C" double scn_DR_(ValidConvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          oF + i, b + i, op, op, nActive);
    }
  }
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_forward2<real>(
                       iF, oF, w, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; flops += nHotB * c;)
  return flops;
}
extern "C" void scn_DR_(ValidConvolution_backward)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
    THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getValidRuleBook(inputSize, filterSize, true);
  uInt nActive = input_features->size[0];
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  uInt c = ip * op;
  RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
                       iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
                       THCState_getCurrentStream(state));
                   , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h
0 → 100644
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_CONVOLUTION_H
#define GPU_CONVOLUTION_H
#include "../SparseConvNet.h"
template <typename T>
__global__ void Convolution_fp_bias(T *output_features, T *bias, uInt nPlanes,
                                    uInt output_stride, uInt nActive) {
  __shared__ T b[32];
  b[threadIdx.x] = bias[threadIdx.x];
  for (uInt row = blockIdx.x; row < nActive; row += 1 << 12) {
    output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
  }
}
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, uInt nRows, uInt nColumns,
                           uInt nCOLUMNS) {
  uInt i = blockIdx.x * 32 + threadIdx.x;
  T t = 0;
  for (uInt j = blockIdx.y; j < nRows; j += 32)
    t += matrix[j * nCOLUMNS + i];
  atomicAdd(&target[i], t);
}
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, uInt nRows, uInt nColumns,
                         uInt nCOLUMNS, cudaStream_t stream) {
  if (nColumns / 32 > 0)
    dColumnSum<<<dim3(nColumns / 32, 32), 32, 0, stream>>>(
        matrix, target, nRows, nColumns, nCOLUMNS);
  if (nColumns % 32 > 0) {
    uInt o = nColumns / 32 * 32;
    dColumnSum<<<dim3(1, 32), nColumns - o, 0, stream>>>(
        matrix + o, target + o, nRows, nColumns, nCOLUMNS);
  }
}
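dColumnSum splits the columns across blocks of 32 threads and the rows across 32 y-blocks, each thread accumulating a private partial sum before a single atomicAdd per column. The net effect is a strided column sum, which is all the bias gradient needs; a CPU reference (illustrative name):

#include <cstddef>

// CPU sketch of Convolution_bp_bias: the bias gradient is the column-wise sum
// of d_output_features over all active rows (rowStride between rows).
void columnSumRef(const float *matrix, float *target, std::size_t nRows,
                  std::size_t nColumns, std::size_t rowStride) {
  for (std::size_t j = 0; j < nColumns; j++)
    for (std::size_t i = 0; i < nRows; i++)
      target[j] += matrix[i * rowStride + j];
}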
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures,
                                            T *w, uInt *rules, uInt nHot,
                                            uInt input_nPlanes,
                                            uInt input_stride,
                                            uInt output_nPlanes,
                                            uInt output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
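Stripped of the rulebook gather/scatter, the inner loops of forwardA compute a K x K tile product in which each thread owns one output column (tx) and V of the K rows, keeping its partial sums in registers while the I and W tiles sit in shared memory. A serial sketch of that register blocking (illustrative name; this is the arithmetic, not the kernel itself):

// Sketch of the Volkov-style blocking in dConvolution_KMxKN_forwardA: a block
// of K x (K/V) threads computes a K x K tile product, each thread holding V
// partial outputs in registers.
template <int K, int V>
void tileMultiplyRef(const float (&I)[K][K], const float (&W)[K][K],
                     float (&O)[K][K]) {
  for (int tx = 0; tx < K; tx++)       // one "thread column"
    for (int ty = 0; ty < K / V; ty++) // one thread per K/V rows
      for (int v = 0; v < V; v++) {
        int row = ty + v * (K / V); // the V rows this thread owns
        float acc = 0;
        for (int k = 0; k < K; k++)
          acc += I[row][k] * W[k][tx];
        O[row][tx] += acc;
      }
}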
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures,
                                            T *w, uInt *rules, uInt nHot,
                                            uInt input_nPlanes,
                                            uInt input_stride,
                                            uInt output_nPlanes,
                                            uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
#define FOO(K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
dim3(K, K / V), 0, stream>>> \
(inFeatures, outFeatures, w, rules, o, input_nPlanes, \
input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB<T, K, V> << <dim3(1, output_nPlanes / K), \
dim3(K, K / V), 0, stream>>> \
(inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                          uInt nHot, uInt input_nPlanes, uInt input_stride,
                          uInt output_nPlanes, uInt output_stride,
                          cudaStream_t stream) {
  FOO(64, 16)
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
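The FOO dispatch above first sends o = (nHot / K) * K rows to the fully unrolled forwardA and the ragged remainder to the bounds-checked forwardB; only if no supported K divides both plane counts does the assert fire. A sketch of the split arithmetic (illustrative names):

// Sketch of the nHot split performed by the FOO dispatch macro: forwardA gets
// the largest multiple of K, forwardB mops up the ragged tail.
struct HotSplit {
  unsigned bulk; // rows for the unchecked kernel (multiple of K)
  unsigned tail; // rows for the bounds-checked kernel
};

HotSplit splitHot(unsigned nHot, unsigned K) {
  unsigned o = (nHot / K) * K;
  return {o, nHot - o};
}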
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R0[v] = rules[2 * (s + ty[v])];
        R1[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R0[v] = rules[2 * (s + ty[v])];
          R1[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
#define FOO(K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
uInt o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A<T, K, V> << < \
dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
dim3(K, K / V), 0, stream>>> \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B<T, K, V> << < \
dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
(inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                              T *w, T *dw, uInt *rules, uInt nHot,
                              uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride,
                              cudaStream_t stream) {
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures,
                                            T *w, uInt *rules, uInt nHot,
                                            uInt input_nPlanes,
                                            uInt input_stride,
                                            uInt output_nPlanes,
                                            uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  uInt M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  uInt KO = min(K, output_nPlanes - K * n);
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ uInt R[K * 2];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    uInt KI = min(K, input_nPlanes - K * m);
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
template <typename T>
void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                           uInt nHot, uInt input_nPlanes, uInt input_stride,
                           uInt output_nPlanes, uInt output_stride,
                           cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dConvolution_KMxKN_forward2<T, K, V><<<
        dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V), 0, stream>>>(
        inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
        output_nPlanes, output_stride);
    return;
  } else {
    dConvolution_forward(inFeatures, outFeatures, w, rules, nHot,
                         input_nPlanes, input_stride, output_nPlanes,
                         output_stride, stream);
  }
}
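The *2 kernels differ from the plain versions by clipping the last tile in each direction to KI = min(K, nPlanes - K*m) instead of requiring divisibility, which is why dConvolution_forward2 can serve as the general fallback while multiples of 8 take the faster unpadded path. A sketch of the ragged-edge tile widths (illustrative name):

#include <algorithm>
#include <cstdio>

// Sketch of the ragged-edge tiling used by the *2 kernels: the last tile in
// each direction is clipped to min(K, nPlanes - K*m) rather than padded.
void printTileWidths(int nPlanes, int K) {
  int M = (nPlanes + K - 1) / K; // number of K-wide tiles, last one partial
  for (int m = 0; m < M; m++)
    std::printf("tile %d covers %d planes\n", m, std::min(K, nPlanes - K * m));
}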
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, uInt K, uInt V>
__global__ void dConvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = (output_nPlanes + K - 1) / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  uInt KI = min(K, input_nPlanes - K * m);
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ uInt R[K * 2];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    uInt KO = min(K, output_nPlanes - K * n);
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
                               T *w, T *dw, uInt *rules, uInt nHot,
                               uInt input_nPlanes, uInt input_stride,
                               uInt output_nPlanes, uInt output_stride,
                               cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dConvolution_KMxKN_backward_dW2<T, K, V><<<
        dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V), 0, stream>>>(
        inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
        input_nPlanes, input_stride, output_nPlanes, output_stride);
    return;
  } else {
    dConvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                             rules, nHot, input_nPlanes, input_stride,
                             output_nPlanes, output_stride, stream);
  }
}
#endif /* GPU_CONVOLUTION_H */