Commit de3743f6 authored by Benjamin Thomas Graham

Factor out CUDA code

parent f0407b36
@@ -4,7 +4,31 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int *rules,
Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] < input_features[i + plane])
output_features[o + plane] = input_features[i + plane];
}
}
template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
}
template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
...
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_MAXPOOLING_H
#define CPU_MAXPOOLING_H
template <typename T>
void MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] < input_features[i + plane])
output_features[o + plane] = input_features[i + plane];
}
}
template <typename T>
void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
}
#endif /* CPU_MAXPOOLING_H */
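A minimal stand-alone driver (a sketch, not part of the commit) may help read the rulebook convention used above: rules is a flat array of (input row, output row) pairs, one pair per active output site. It assumes the header above is saved as CPU/MaxPooling.h and that Int is the library's plain integer typedef.
#include <cstdio>
typedef int Int;            // assumption: matches the library's Int alias
#include "CPU/MaxPooling.h" // assumption: path of the header shown above
int main() {
  float in[2 * 3] = {1, 5, 2, 4, 0, 7};        // two input sites, three planes
  float out[1 * 3] = {-1e30f, -1e30f, -1e30f}; // "-infinity" so any input wins
  Int rules[4] = {0, 0,  // input row 0 -> output row 0
                  1, 0}; // input row 1 -> output row 0
  MaxPooling_ForwardPass<float>(in, out, /*nPlanes=*/3, /*input_stride=*/3,
                                /*output_stride=*/3, rules, /*nHot=*/2);
  printf("%g %g %g\n", out[0], out[1], out[2]); // prints: 4 5 7
}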
@@ -4,7 +4,30 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *i = input_features + rules[2 * outSite] * nPlanes;
T *o = output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
o[plane * spatialVolume] = i[plane];
}
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
T *d_o = d_output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
d_i[plane] = d_o[plane * spatialVolume];
}
}
template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(
...
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_SPARSETODENSE_H
#define CPU_SPARSETODENSE_H
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *i = input_features + rules[2 * outSite] * nPlanes;
T *o = output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
o[plane * spatialVolume] = i[plane];
}
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
T *d_i = d_input_features + rules[2 * outSite] * nPlanes;
T *d_o = d_output_features + rules[2 * outSite + 1];
for (Int plane = 0; plane < nPlanes; plane++)
d_i[plane] = d_o[plane * spatialVolume];
}
}
#endif /* CPU_SPARSETODENSE_H */
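The indexing above implies a plane-major dense layout: the output is an [nPlanes x spatialVolume] block, rules[2k] is a sparse row index and rules[2k + 1] a flattened spatial cell (any per-sample offset is assumed to be folded into it by the rulebook). A sketch under those assumptions, with the header above saved as CPU/SparseToDense.h:
#include <cstdio>
typedef int Int;               // assumption: matches the library's Int alias
#include "CPU/SparseToDense.h" // assumption: path of the header shown above
int main() {
  float sparse[2 * 2] = {1, 2, 3, 4}; // two active sites, two planes each
  float dense[2 * 4] = {0};           // plane-major: [plane][spatial cell]
  Int rules[4] = {0, 0,  // sparse row 0 -> spatial cell 0
                  1, 3}; // sparse row 1 -> spatial cell 3
  SparseToDense_ForwardPass<float>(sparse, dense, /*nPlanes=*/2,
                                   /*spatialVolume=*/4, rules, /*nHot=*/2);
  for (int i = 0; i < 8; i++) // prints: 1 0 0 3 2 0 0 4
    printf("%g ", dense[i]);
  printf("\n");
}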
@@ -4,7 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int *rules,
Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane];
}
}
template <typename T>
void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] += d_output_features[o + plane];
}
}
template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
...
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_UNPOOLING_H
#define CPU_UNPOOLING_H
template <typename T>
void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int *rules,
Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane];
}
}
template <typename T>
void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite + 1] * input_stride;
Int o = rules[2 * outSite] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] += d_output_features[o + plane];
}
}
#endif /* CPU_UNPOOLING_H */
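Note that UnPooling reads the pairs with the roles swapped relative to MaxPooling (rules[2k] indexes the output, rules[2k + 1] the input) and accumulates with +=, since one pooled site can fan out to several output sites. A sketch under the same assumptions as above, with the header saved as CPU/UnPooling.h:
#include <cstdio>
typedef int Int;           // assumption: matches the library's Int alias
#include "CPU/UnPooling.h" // assumption: path of the header shown above
int main() {
  float in[1 * 2] = {3, 5}; // one pooled site, two planes
  float out[2 * 2] = {0};   // two unpooled sites, two planes
  Int rules[4] = {0, 0,  // output row 0 <- input row 0
                  1, 0}; // output row 1 <- input row 0
  UnPooling_ForwardPass<float>(in, out, /*nPlanes=*/2, /*input_stride=*/2,
                               /*output_stride=*/2, rules, /*nHot=*/2);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // prints: 3 5 3 5
}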
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average);
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average);
template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
output_features.resize_({batchSize, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
ActivePooling_ForwardPass<T>(iF, oF, batchSize, maxActive, nPlanes,
&_rules[0][0], average);
}
template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
ActivePooling_BackwardPass<T>(diF, doF, batchSize, maxActive, nPlanes,
&_rules[0][0], average);
}
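The active-pooling rulebook layout, as inferred from the kernels that follow, is one row of (maxActive + 1) Ints per batch sample: the first entry is the sample's active-site count, the rest are input row indices. A CPU reference sketch of the forward reduction (an illustration under those assumptions, not library code):
#include <cstdio>
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void ActivePooling_ForwardPass_cpuRef(T *in, T *out, Int batchSize,
                                      Int maxActive, Int nPlanes, Int *rules,
                                      bool average) {
  for (Int b = 0; b < batchSize; b++) {
    Int *r = rules + b * (maxActive + 1); // row b: [nActive, site indices...]
    Int nActive = *r++;
    T mul = (average && nActive > 0) ? T(1) / nActive : T(1);
    for (Int a = 0; a < nActive; a++, r++)
      for (Int p = 0; p < nPlanes; p++)
        out[b * nPlanes + p] += in[*r * nPlanes + p] * mul;
  }
}
int main() {
  float in[3 * 2] = {1, 2, 3, 4, 5, 6}; // three sites, two planes
  float out[1 * 2] = {0, 0};
  Int rules[3] = {2, 0, 2}; // sample 0: two active sites, rows 0 and 2
  ActivePooling_ForwardPass_cpuRef<float>(in, out, 1, 2, 2, rules, true);
  printf("%g %g\n", out[0], out[1]); // prints: 3 4 (mean of rows 0 and 2)
}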
@@ -4,64 +4,73 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      out[plane] += inp[plane] * multiplier;
  }
}
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_fp<T><<<batchSize_, kernelBlockDim>>>(
        input_features, output_features + o * nPlanes, maxActive, nPlanes, rb,
        average);
  }
}
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
                                 Int maxActive, Int nPlanes, Int *rules,
                                 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
  Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
  while (nActive-- > 0) {
    T *inp = &d_input_features[(*r++) * nPlanes];
    for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
      inp[plane] = out[plane] * multiplier;
  }
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
                                Int *rules, bool average) {
  auto rulesBuffer = at::CUDA(at_kINT).tensor({1 << 22});
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));
  assert(rowBatchSize > 0);
  Int kernelBlockDim = std::min(nPlanes, (Int)32);
  for (Int o = 0; o < batchSize; o += rowBatchSize) {
    Int batchSize_ = std::min(rowBatchSize, (Int(batchSize - o)));
    cudaMemcpy(rb, rules + o * (maxActive + 1),
               sizeof(Int) * (maxActive + 1) * batchSize_,
               cudaMemcpyHostToDevice);
    ActivePooling_bp<T><<<batchSize_, kernelBlockDim>>>(
        d_input_features, d_output_features + o * nPlanes, maxActive, nPlanes,
        rb, average);
  }
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_ACTIVEPOOLING_H
#define CUDA_ACTIVEPOOLING_H
template <typename T>
__global__ void ActivePooling_fp(T *input_features, T *output_features,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &output_features[blockIdx.x * nPlanes];
Int *r = &rules[blockIdx.x * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
out[plane] += inp[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average) {
Int kernelBlockDim = std::min(nPlanes, (Int)32);
ActivePooling_fp<T><<<batchSize, kernelBlockDim>>>(
input_features, output_features, maxActive, nPlanes, rules, average);
}
template <typename T>
__global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
T *out = &d_output_features[blockIdx.x * nPlanes];
Int *r = &rules[blockIdx.x * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
for (Int plane = threadIdx.x; plane < nPlanes; plane += 32)
inp[plane] = out[plane] * multiplier;
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
Int *rules, bool average) {
Int kernelBlockDim = std::min(nPlanes, (Int)32);
ActivePooling_bp<T><<<batchSize, kernelBlockDim>>>(
d_input_features, d_output_features, maxActive, nPlanes, rules, average);
}
#endif /* CUDA_ACTIVEPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive);
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad);
template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor convWeight) {
output_features.resize_({input_features.size(0), convWeight.size(1)});
dAffineReluTrivialConvolution_forward<T>(
input_features.data<T>(), output_features.data<T>(),
affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
convWeight.size(0), input_features.stride(0), convWeight.size(1),
output_features.size(1), input_features.size(0));
return input_features.size(0) * input_features.size(1) *
output_features.size(1);
}
template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor affineWeight,
/*cuda float*/ at::Tensor d_affineWeight,
/*cuda float*/ at::Tensor affineBias,
/*cuda float*/ at::Tensor d_affineBias,
/*cuda float*/ at::Tensor convWeight,
/*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
d_input_features.resize_as_(input_features);
dAffineReluTrivialConvolution_backward_dW<T>(
input_features.data<T>(), d_input_features.data<T>(),
d_output_features.data<T>(), affineWeight.data<T>(),
d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
convWeight.data<T>(), d_convWeight.data<T>(), convWeight.size(0),
input_features.stride(0), convWeight.size(1), d_output_features.stride(0),
input_features.size(0), additiveGrad);
}
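Functionally, the kernels declared above compute out = relu(in * affineWeight + affineBias) x convWeight: a per-plane affine and ReLU fused with a "trivial" (1x1) convolution, i.e. a plain matrix product with convWeight stored [input_nPlanes x output_nPlanes] row-major. A dense-stride CPU sketch of that contract (illustration only; strides and layouts are assumptions stated in the comments):
#include <cstdio>
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void affineReluTrivialConv_cpuRef(T *in, T *out, T *aw, T *ab, T *cw,
                                  Int inPlanes, Int outPlanes, Int nActive) {
  // assumes dense strides; cw[i * outPlanes + o] matches the kernels'
  // convWeight[k * output_nPlanes + tx] indexing
  for (Int r = 0; r < nActive; r++)
    for (Int o = 0; o < outPlanes; o++) {
      T acc = 0;
      for (Int i = 0; i < inPlanes; i++) {
        T x = in[r * inPlanes + i] * aw[i] + ab[i];     // affine
        acc += (x > 0 ? x : 0) * cw[i * outPlanes + o]; // ReLU, then 1x1 conv
      }
      out[r * outPlanes + o] = acc;
    }
}
int main() {
  float in[2] = {1, -1}, aw[2] = {1, 1}, ab[2] = {0, 0};
  float cw[2] = {2, 3}; // two input planes -> one output plane
  float out[1];
  affineReluTrivialConv_cpuRef<float>(in, out, aw, ab, cw, 2, 1, 1);
  printf("%g\n", out[0]); // prints: 2 (the ReLU kills the second term)
}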
@@ -4,44 +4,432 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// nActive must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nActive) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
                                           T *affineWeight, T *affineBias,
                                           T *convWeight, Int input_nPlanes,
                                           Int input_stride, Int output_nPlanes,
                                           Int output_stride, Int nActive) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
  assert(false);
}
template <>
void dAffineReluTrivialConvolution_forward<double>(
    double *inFeatures, double *outFeatures, double *affineWeight,
    double *affineBias, double *convWeight, Int input_nPlanes, Int input_stride,
    Int output_nPlanes, Int output_stride, Int nActive) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
} else {
i[v] = 0;
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
additiveGrad); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
    T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
    T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
    Int output_stride, Int nActive, bool additiveGrad) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
}
#undef FOO
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_AFFINERELUTRIVIALCONVOLUTION_H
#define CUDA_AFFINERELUTRIVIALCONVOLUTION_H
// check if A+B is faster than just B
// check if loading affineBias into shared memory is faster than loading
// multiple times (if not try 64,16 backwards case)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardA(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// nActive must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_forwardB(
T *inFeatures, T *outFeatures, T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nActive x KM -> nActive x KN - parallel over N,nActive - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
convWeight += n * K;
T O[V];
__shared__ T I[K][K]; // zz try K+1 trick A+B+backwards
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
// Read affineWeight, affineBias and convWeight
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
// Read input, do affine + relu, set O[]
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nActive) {
T i = inFeatures[(s + ty[v]) * input_stride + tx] * AW[tx] + AB[tx];
I[ty[v]][tx] = (i > 0) ? i : 0;
if (m == 0) {
O[v] = 0;
} else {
O[v] = outFeatures[(s + ty[v]) * output_stride + tx];
}
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++)
O[v] += I[ty[v]][k] * CW[k][tx];
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive)
outFeatures[(s + ty[v]) * output_stride + tx] = O[v];
__syncthreads();
}
affineWeight += K;
affineBias += K;
convWeight += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, affineWeight, affineBias, convWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o); \
if (nActive > o) \
dAffineReluTrivialConvolution_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, outFeatures + o * output_stride, \
affineWeight, affineBias, convWeight, input_nPlanes, input_stride, \
output_nPlanes, output_stride, nActive - o); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_forward(T *inFeatures, T *outFeatures,
T *affineWeight, T *affineBias,
T *convWeight, Int input_nPlanes,
Int input_stride,
Int output_nPlanes,
Int output_stride, Int nActive) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dAffineReluTrivialConvolution_forward<double>(
double *inFeatures, double *outFeatures, double *affineWeight,
double *affineBias, double *convWeight, Int input_nPlanes,
Int input_stride, Int output_nPlanes, Int output_stride, Int nActive) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dAffineReluTrivialConvolution_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
convWeight += m * K * output_nPlanes;
dConvWeight += m * K * output_nPlanes;
affineWeight += m * K;
dAffineWeight += m * K;
affineBias += m * K;
dAffineBias += m * K;
T dI[V];
T dCW[V];
T i[V];
T dAW = 0;
T dAB = 0;
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T AW[K];
__shared__ T AB[K];
__shared__ T CW[K][K];
const Int tx = threadIdx.x;
int ty[V];
#pragma unroll
for (int v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
if (ty[0] == 0) {
AW[tx] = affineWeight[tx];
AB[tx] = affineBias[tx];
}
for (int n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int v = 0; v < V; v++) {
CW[ty[v]][tx] = convWeight[ty[v] * output_nPlanes + tx];
dCW[v] = 0;
}
__syncthreads();
for (Int s = blockIdx.x * K; s < nActive; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++)
dI[v] = 0;
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
T i_ = inFeatures[(s + ty[v]) * input_stride + tx];
i[v] = i_;
i_ = i_ * AW[tx] + AB[tx];
I[ty[v]][tx] = (i_ > 0) ? i_ : 0;
dO[ty[v]][tx] = dOutFeatures[(s + ty[v]) * output_stride + tx];
} else {
i[v] = 0;
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < K; k++)
#pragma unroll
for (int v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * CW[tx][k];
dCW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int v = 0; v < V; v++)
if (s + ty[v] < nActive) {
dI[v] = (I[ty[v]][tx] > 0) ? dI[v] : 0;
dAW += i[v] * dI[v];
dAB += dI[v];
if (additiveGrad)
dInFeatures[(s + ty[v]) * input_stride + tx] += dI[v];
else
dInFeatures[(s + ty[v]) * input_stride + tx] = dI[v];
}
__syncthreads();
}
#pragma unroll
for (int v = 0; v < V; v++)
atomicAdd(&dConvWeight[ty[v] * output_nPlanes + tx], dCW[v]);
convWeight += K;
dConvWeight += K;
dOutFeatures += K;
__syncthreads();
}
atomicAdd(&dAffineWeight[tx], dAW);
atomicAdd(&dAffineBias[tx], dAB);
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nActive / K) * K; \
if (o > 0) \
dAffineReluTrivialConvolution_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, affineWeight, \
dAffineWeight, affineBias, dAffineBias, convWeight, dConvWeight, \
input_nPlanes, input_stride, output_nPlanes, output_stride, o, \
additiveGrad); \
if (nActive > o) \
dAffineReluTrivialConvolution_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures + o * input_stride, dInFeatures + o * input_stride, \
dOutFeatures + o * output_stride, affineWeight, dAffineWeight, \
affineBias, dAffineBias, convWeight, dConvWeight, input_nPlanes, \
input_stride, output_nPlanes, output_stride, nActive - o, \
additiveGrad); \
return; \
} \
}
template <typename T>
void dAffineReluTrivialConvolution_backward_dW(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *affineWeight,
T *dAffineWeight, T *affineBias, T *dAffineBias, T *convWeight,
T *dConvWeight, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride, Int nActive, bool additiveGrad) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
}
#undef FOO
#endif /* CUDA_AFFINERELUTRIVIALCONVOLUTION_H */
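To make the FOO dispatch concrete: it walks the (K, V) cascade from the largest tile down, takes the first K dividing both plane counts, runs kernel A (no bounds checks) on the K-aligned prefix of nActive, and kernel B (bounds-checked) on the ragged tail. A stand-alone sketch of the resulting launch geometry, with illustrative sizes (not from the commit):
#include <algorithm>
#include <cstdio>
#include <initializer_list>
typedef int Int; // assumption: matches the library's Int alias
int main() {
  Int input_nPlanes = 48, output_nPlanes = 16, nActive = 1000; // example only
  for (Int K : {64, 32, 16, 8}) { // same order as the FOO(T, K, V) cascade
    if (input_nPlanes % K || output_nPlanes % K)
      continue;                // K must divide both plane counts
    Int V = K / 4;             // the (K, V) pairs above are all V = K / 4
    Int o = (nActive / K) * K; // K-aligned prefix handled by kernel A
    printf("K=%d V=%d | A: rows=%d grid=(%d,%d) block=(%d,%d) | B: rows=%d\n",
           K, V, o, std::min(o / K, (Int)512), output_nPlanes / K, K, K / V,
           nActive - o); // here: K=16 V=4, A covers 992 rows, B the last 8
    break;               // first (largest) admissible K wins
  }
}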
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, RuleBook _rules,
Int filterVolume);
template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, RuleBook _rules,
Int filterVolume);
template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
cuda_AveragePooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), _rules,
_rules.size());
}
template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
cuda_AveragePooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), _rules,
_rules.size());
}
@@ -4,51 +4,73 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
                                  Int nPlanes, Int input_stride,
                                  Int output_stride, Int *rules, Int nHot,
                                  T alpha) {
  __shared__ Int r[NTY * 2];
  for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      Int i = threadIdx.x + NTX * threadIdx.y;
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      Int i = r[2 * threadIdx.y] * input_stride;
      Int o = r[2 * threadIdx.y + 1] * output_stride;
      for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
        atomicAdd(&output_features[o + plane],
                  alpha * input_features[i + plane]);
    }
    __syncthreads();
  }
}
template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
                                     Int nPlanes, Int input_stride,
                                     Int output_stride, RuleBook _rules,
                                     Int filterVolume) {
  RULEBOOKITERATOR((AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rbB, nHotB, 1.0 / filterVolume));
                   , )
}
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += alpha * d_output_features[o + plane];
}
__syncthreads();
}
}
template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                      Int nPlanes, Int input_stride,
                                      Int output_stride, RuleBook _rules,
                                      Int filterVolume) {
  RULEBOOKITERATOR((AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
      d_input_features, d_output_features, nPlanes, input_stride, output_stride,
      rbB, nHotB, 1.0 / filterVolume));
                   , )
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_AVERAGEPOOLING_H
#define CUDA_AVERAGEPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
atomicAdd(&output_features[o + plane],
alpha * input_features[i + plane]);
}
__syncthreads();
}
}
template <typename T>
void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
template <typename T, Int NTX, Int NTY>
__global__ void AveragePooling_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
T alpha) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += alpha * d_output_features[o + plane];
}
__syncthreads();
}
}
template <typename T>
void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules,
Int nHot, Int filterVolume) {
AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, input_stride, output_stride,
rules, nHot, 1.0 / filterVolume);
}
#endif /* CUDA_AVERAGEPOOLING_H */
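A scalar CPU reference of the two kernels above (a sketch for checking, not library code): with alpha = 1/filterVolume the forward scatters input/filterVolume into each pooled site, and the backward hands every contributing input site d_output/filterVolume.
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void AveragePooling_fp_cpuRef(T *input_features, T *output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot, T alpha) {
  for (Int n = 0; n < nHot; n++) {
    T *i = input_features + rules[2 * n] * input_stride;
    T *o = output_features + rules[2 * n + 1] * output_stride;
    for (Int p = 0; p < nPlanes; p++)
      o[p] += alpha * i[p]; // serial equivalent of the forward atomicAdd
  }
}
template <typename T>
void AveragePooling_bp_cpuRef(T *d_input_features, T *d_output_features,
                              Int nPlanes, Int input_stride, Int output_stride,
                              Int *rules, Int nHot, T alpha) {
  for (Int n = 0; n < nHot; n++) {
    T *di = d_input_features + rules[2 * n] * input_stride;
    T *dout = d_output_features + rules[2 * n + 1] * output_stride;
    for (Int p = 0; p < nPlanes; p++)
      di[p] += alpha * dout[p]; // spread the gradient back uniformly
  }
}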
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void bn_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
T *d_bias, T leakiness);
template <typename T>
void cuda_BatchNormalization_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor saveMean,
/*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
T momentum, bool train, T leakiness) {
output_features.resize_as_(input_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
bn_f(input_features.data<T>(), output_features.data<T>(), nPlanes,
input_stride, output_stride, nActive, saveMean.data<T>(),
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,
momentum, train, leakiness);
}
}
template <typename T>
void cuda_BatchNormalization_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
/*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias,
/*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias,
T leakiness) {
d_input_features.resize_as_(d_output_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
bn_b(input_features.data<T>(), d_input_features.data<T>(),
output_features.data<T>(), d_output_features.data<T>(), nPlanes,
input_stride, output_stride, nActive, saveMean.data<T>(),
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),
OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias),
leakiness);
}
}
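A scalar sketch of what bn_f computes per plane in training mode, inferred from the BatchNormalization_f_train kernel below (illustration, not library code; it assumes input_stride == output_stride == stride, and note the momentum convention here weights the old running value):
#include <cmath>
typedef int Int; // assumption: matches the library's Int alias
template <typename T>
void bn_f_train_cpuRef(T *iF, T *oF, Int nPlanes, Int stride, Int nActive,
                       T *runningMean, T *runningVar, T *weight, T *bias,
                       T eps, T momentum, T leakiness) {
  for (Int p = 0; p < nPlanes; p++) {
    T s = 0, s2 = 0;
    for (Int r = 0; r < nActive; r++) { // batch statistics for plane p
      T x = iF[r * stride + p];
      s += x;
      s2 += x * x;
    }
    T mean = s / nActive;
    T varN = s2 - mean * mean * nActive; // nActive times the biased variance
    runningMean[p] = momentum * runningMean[p] + (1 - momentum) * mean;
    runningVar[p] =
        momentum * runningVar[p] + (1 - momentum) * varN / (nActive - 1);
    T invStd = std::pow(varN / nActive + eps, T(-0.5));
    T W = invStd * (weight ? weight[p] : T(1)); // fused scale
    T B = (bias ? bias[p] : T(0)) - mean * W;   // fused shift
    for (Int r = 0; r < nActive; r++) {
      T out = W * iF[r * stride + p] + B;
      oF[r * stride + p] = out > 0 ? out : out * leakiness; // leaky ReLU
    }
  }
}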
@@ -4,56 +4,215 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow
// larger values to act on a subset of columns, e.g. in-place DenseNet blocks
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_train(
T *input_features, T *output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
row < nActive; row += NTY, c += input_stride * NTY) {
T i = input_features[c];
t[threadIdx.y][threadIdx.x] += i;
t2[threadIdx.y][threadIdx.x] += i * i;
}
__syncthreads();
T _saveMean = 0;
T _saveInvStd = 0;
for (Int row = 0; row < NTY; row++) {
_saveMean += t[row][threadIdx.x];
_saveInvStd += t2[row][threadIdx.x];
}
_saveMean /= nActive;
_saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
if (threadIdx.y == 0) {
saveMean[plane] = _saveMean;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * _saveMean;
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * _saveInvStd / (nActive - 1);
}
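    // runningVar above uses the unbiased (n - 1) estimate; the saved inverse
    // standard deviation below uses the biased 1/n estimate plus eps.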
_saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
if (threadIdx.y == 0)
saveInvStd[plane] = _saveInvStd;
__syncthreads();
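    // Fold normalization and the affine transform into a single multiply-add:
    // out = W * x + B, with W = invStd * weight and B = bias - mean * W.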
if (threadIdx.y == 0) {
t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
t[1][threadIdx.x] =
-_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
}
__syncthreads();
T W = t[0][threadIdx.x];
T B = t[1][threadIdx.x];
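    // Stream the rows once, applying the fused scale/shift and leaky ReLU.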
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = W * input_features[ci] + B;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void BatchNormalization_f_test(
T *input_features, T *output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum, T leakiness) {
__shared__ T W[NTX];
__shared__ T B[NTX];
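  // Inference path: fold the running statistics and affine parameters into a
  // per-plane scale W and shift B, computed once per plane tile.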
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
W[threadIdx.x] =
pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
B[threadIdx.x] =
(bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
}
__syncthreads();
    T w = W[threadIdx.x], b = B[threadIdx.x]; // T, not float: avoids precision loss when T is double
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = w * input_features[ci] + b;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean,
T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
BatchNormalization_f_train<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
} else {
BatchNormalization_f_test<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
T *runningMean, T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
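  // The shared buffers are reused per plane tile: first to broadcast the saved
  // statistics to all threads, then as scratch for the gradMean/dotp reductions.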
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
t[0][threadIdx.x] = saveMean[plane];
t[1][threadIdx.x] = saveInvStd[plane];
t[2][threadIdx.x] = (weight ? weight[plane] : 1);
}
__syncthreads();
T _saveMean = t[0][threadIdx.x];
T _saveInvStd = t[1][threadIdx.x];
T _weight = t[2][threadIdx.x];
__syncthreads();
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
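      // The leaky ReLU gradient is written back into d_output_features, so the
      // incoming gradient buffer is modified in place.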
t[threadIdx.y][threadIdx.x] += d;
t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
}
__syncthreads();
T gradMean = 0;
T dotp = 0;
for (int row = 0; row < NTY; row++) {
gradMean += t[row][threadIdx.x];
dotp += t2[row][threadIdx.x];
}
__syncthreads();
if (d_weight)
d_weight[plane] = dotp * _saveInvStd;
if (d_bias)
      d_bias[plane] = gradMean; // still the raw sum here; divided by nActive below
gradMean /= nActive;
T k = dotp * _saveInvStd * _saveInvStd / nActive;
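    // Standard batch-norm input gradient:
    // dx = (dy - mean(dy) - (x - mean) * k) * invStd * weight,
    // where k = dotp * invStd^2 / nActive.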
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
d_input_features[ci] = (d_output_features[co] - gradMean -
(input_features[ci] - _saveMean) * k) *
_saveInvStd * _weight;
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
BatchNormalization_b<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
}
#define BN_F_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_ForwardPass<T, N, 64>(                                  \
-        input_features.data<T>(), output_features.data<T>(), nPlanes,         \
-        input_stride, output_stride, nActive, saveMean.data<T>(),             \
-        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),    \
-        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, momentum, \
        iF, oF, nPlanes, input_stride, output_stride, nActive, saveMean,       \
        saveInvStd, runningMean, runningVar, weight, bias, eps, momentum,      \
        train, leakiness);                                                     \
  }
template <typename T>
-void cuda_BatchNormalization_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor saveMean,
-    /*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
-    T momentum, bool train, T leakiness) {
-  output_features.resize_as_(input_features);
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_F_MACRO(16)
-    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
-  }
-}
-template <typename T>
-void cuda_BatchNormalizationInTensor_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor saveMean,
-    /*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
-    T momentum, bool train, T leakiness) {
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_F_MACRO(16)
-    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
-  }
void bn_f(T *iF, T *oF, Int nPlanes, Int input_stride, Int output_stride,
          Int nActive, T *saveMean, T *saveInvStd, T *runningMean,
          T *runningVar, T *weight, T *bias, T eps, T momentum, bool train,
          T leakiness) {
  BN_F_MACRO(16)
  else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
#undef BN_F_MACRO
@@ -61,34 +220,19 @@ void cuda_BatchNormalizationInTensor_updateOutput(
#define BN_B_MACRO(N)                                                          \
  if (nPlanes % N == 0) {                                                      \
    BatchNormalization_BackwardPass<T, N, 64>(                                 \
-        input_features.data<T>(), d_input_features.data<T>(),                 \
-        output_features.data<T>(), d_output_features.data<T>(), nPlanes,      \
-        input_stride, output_stride, nActive, saveMean.data<T>(),             \
-        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),    \
-        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),           \
-        OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias), leakiness); \
        input_features, d_input_features, output_features, d_output_features,  \
        nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,   \
        runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);   \
  }
template <typename T>
-void cuda_BatchNormalization_backward(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor output_features,
-    /*cuda float*/ at::Tensor d_output_features,
-    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
-    /*cuda float*/ at::Tensor runningMean,
-    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
-    /*cuda float*/ at::Tensor bias,
-    /*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias,
-    T leakiness) {
-  d_input_features.resize_as_(d_output_features);
-  if (input_features.ndimension() == 2) {
-    auto nActive = input_features.size(0);
-    auto nPlanes = input_features.size(1);
-    auto input_stride = input_features.stride(0);
-    auto output_stride = output_features.stride(0);
-    BN_B_MACRO(16)
-    else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
-  }
void bn_b(T *input_features, T *d_input_features, T *output_features,
          T *d_output_features, Int nPlanes, Int input_stride,
          Int output_stride, Int nActive, T *saveMean, T *saveInvStd,
          T *runningMean, T *runningVar, T *weight, T *bias, T *d_weight,
          T *d_bias, T leakiness) {
  BN_B_MACRO(16)
  else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
#undef BN_B_MACRO
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHNORMALIZATION_H
#define CUDA_BATCHNORMALIZATION_H
#include <cassert>
// input_stride and output_stride are normally equal to nPlanes; larger values
// let the kernels act on a subset of columns, e.g. for in-place DenseNet blocks.
// NTX ~ 16: nPlanes must be a multiple of this.
// NTY ~ 64: must be at least 4.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_train(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
row < nActive; row += NTY, c += input_stride * NTY) {
T i = input_features[c];
t[threadIdx.y][threadIdx.x] += i;
t2[threadIdx.y][threadIdx.x] += i * i;
}
__syncthreads();
T _saveMean = 0;
T _saveInvStd = 0;
for (Int row = 0; row < NTY; row++) {
_saveMean += t[row][threadIdx.x];
_saveInvStd += t2[row][threadIdx.x];
}
_saveMean /= nActive;
_saveInvStd = _saveInvStd - _saveMean * _saveMean * nActive;
if (threadIdx.y == 0) {
saveMean[plane] = _saveMean;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * _saveMean;
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * _saveInvStd / (nActive - 1);
}
_saveInvStd = pow(_saveInvStd / nActive + eps, -0.5);
if (threadIdx.y == 0)
saveInvStd[plane] = _saveInvStd;
__syncthreads();
if (threadIdx.y == 0) {
t[0][threadIdx.x] = _saveInvStd * (weight ? weight[plane] : 1);
t[1][threadIdx.x] =
-_saveMean * t[0][threadIdx.x] + (bias ? bias[plane] : 0);
}
__syncthreads();
T W = t[0][threadIdx.x];
T B = t[1][threadIdx.x];
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = W * input_features[ci] + B;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_test(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T W[NTX];
__shared__ T B[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
W[threadIdx.x] =
pow(runningVar[plane] + eps, -0.5) * (weight ? weight[plane] : 1);
B[threadIdx.x] =
(bias ? bias[plane] : 0) - runningMean[plane] * W[threadIdx.x];
}
__syncthreads();
    T w = W[threadIdx.x], b = B[threadIdx.x]; // T, not float: avoids precision loss when T is double
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T out = w * input_features[ci] + b;
output_features[co] = (out > 0) ? out : (out * leakiness);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
BatchNormalization_f_train<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
} else {
BatchNormalization_f_test<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean,
T *saveInvStd, T *runningMean, T *runningVar, T *weight,
T *bias, T *d_weight, T *d_bias, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
t[0][threadIdx.x] = saveMean[plane];
t[1][threadIdx.x] = saveInvStd[plane];
t[2][threadIdx.x] = (weight ? weight[plane] : 1);
}
__syncthreads();
T _saveMean = t[0][threadIdx.x];
T _saveInvStd = t[1][threadIdx.x];
T _weight = t[2][threadIdx.x];
__syncthreads();
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
t[threadIdx.y][threadIdx.x] += d;
t2[threadIdx.y][threadIdx.x] += (input_features[ci] - _saveMean) * d;
}
__syncthreads();
T gradMean = 0;
T dotp = 0;
for (int row = 0; row < NTY; row++) {
gradMean += t[row][threadIdx.x];
dotp += t2[row][threadIdx.x];
}
__syncthreads();
if (d_weight)
d_weight[plane] = dotp * _saveInvStd;
if (d_bias)
      d_bias[plane] = gradMean; // still the raw sum here; divided by nActive below
gradMean /= nActive;
T k = dotp * _saveInvStd * _saveInvStd / nActive;
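    // k = dotp * invStd^2 / nActive is the coefficient of the (x - mean)
    // correction term in the batch-norm input gradient computed below.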
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
d_input_features[ci] = (d_output_features[co] - gradMean -
(input_features[ci] - _saveMean) * k) *
_saveInvStd * _weight;
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
BatchNormalization_b<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
}
#undef NTX
#undef NTY
#endif /* CUDA_BATCHNORMALIZATION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
template <typename T>
void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
Int nPlanes, T alpha);
template <typename T>
void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
T *noise, Int nActive, Int nPlanes, T alpha);
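// bmd_f and bmd_b are only declared here; their definitions (the CUDA kernels
// and launchers) appear in the .cu file later in this diff.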
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
T alpha) {
output_features.resize_as_(input_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
bmd_f(input_features.data<T>(), output_features.data<T>(), noise.data<T>(),
nActive, nPlanes, alpha);
}
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor noise, T alpha) {
d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
bmd_b(input_features.data<T>(), d_input_features.data<T>(),
d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes, alpha);
}
@@ -4,28 +4,63 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
-#include "BatchwiseMultiplicativeDropout.h"
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
T *output_features, T *noise,
Int nActive, Int nPlanes,
Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
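  // One multiplicative noise value per plane, staged in shared memory so every
  // row handled by this block reads it from on-chip storage.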
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
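      // Fused batchwise dropout and leaky ReLU scaling:
      // out = x * noise[plane] * (x > 0 ? 1 : alpha).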
output_features[o] = input_features[i] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_bp(
T *input_features, T *d_input_features, T *d_output_features, T *noise,
Int nActive, Int nPlanes, Int input_stride, Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
-      BatchwiseMultiplicativeDropout_fp<                                      \
-          T, NTX,                                                             \
-          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
-          input_features.data<T>(), output_features.data<T>(),                \
-          noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha);        \
      BatchwiseMultiplicativeDropout_fp<T, NTX, NTY><<<                        \
          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(       \
          input_features, output_features, noise, nActive, nPlanes, nPlanes,   \
          nPlanes, alpha);                                                     \
      return;                                                                  \
    }                                                                          \
  }
template <typename T>
-void cuda_BatchwiseMultiplicativeDropout_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
-    float alpha) {
-  output_features.resize_as_(input_features);
-  auto nActive = input_features.size(0);
-  auto nPlanes = input_features.size(1);
void bmd_f(T *input_features, T *output_features, T *noise, Int nActive,
           Int nPlanes, T alpha) {
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
@@ -39,25 +74,17 @@ void cuda_BatchwiseMultiplicativeDropout_updateOutput(
#define SPARSECONVNET_FOO(NTX, NTY)                                            \
  {                                                                            \
    if (nPlanes % NTX == 0) {                                                  \
-      BatchwiseMultiplicativeDropout_bp<                                      \
-          T, NTX,                                                             \
-          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(   \
-          input_features.data<T>(), d_input_features.data<T>(),               \
-          d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes,     \
-          nPlanes, nPlanes, alpha);                                           \
      BatchwiseMultiplicativeDropout_bp<T, NTX, NTY><<<                        \
          dim3(std::min((Int)16, nPlanes / NTX), 16), dim3(NTX, NTY)>>>(       \
          input_features, d_input_features, d_output_features, noise, nActive, \
          nPlanes, nPlanes, nPlanes, alpha);                                   \
      return;                                                                  \
    }                                                                          \
  }
-template <typename T>
-void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
-    /*cuda float*/ at::Tensor input_features,
-    /*cuda float*/ at::Tensor d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
-    /*cuda float*/ at::Tensor noise, float alpha) {
-  d_input_features.resize_as_(d_output_features);
-  auto nActive = input_features.size(0);
-  auto nPlanes = input_features.size(1);
template <typename T>
void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
           T *noise, Int nActive, Int nPlanes, T alpha) {
  SPARSECONVNET_FOO(32, 32)
  SPARSECONVNET_FOO(24, 32)
  SPARSECONVNET_FOO(16, 64)
@@ -66,4 +93,5 @@ void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
  SPARSECONVNET_FOO(4, 64)
  SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO