"git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "aaf9cbeb9c9b5753eba821e652c80160c558184d"
Commit de3743f6 authored by Benjamin Thomas Graham

Factor out CUDA code

parent f0407b36
SparseConvNetTorch/build/
*.t7
t7/
*.pth
*.o
*.a
*.so
@@ -11,3 +10,5 @@ pickle
PyTorch/sparseconvnet.egg-info/
PyTorch/sparseconvnet/SCN/__init__.py
sparseconvnet.egg-info
*.zip
*.rar
#!/bin/bash
rm -rf build/ sparseconvnet.egg-info sparseconvnet_SCN*.so
rm -rf build/ dist/ sparseconvnet.egg-info sparseconvnet_SCN*.so
python setup.py install
@@ -47,7 +47,7 @@ p['initial_lr'] = 1e-1
p['lr_decay'] = 4e-2
p['weight_decay'] = 1e-4
p['momentum'] = 0.9
p['check_point'] = True
p['check_point'] = False
p['use_cuda'] = torch.cuda.is_available()
dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor'
dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor'
......
@@ -47,7 +47,7 @@ p['initial_lr'] = 1e-1
p['lr_decay'] = 4e-2
p['weight_decay'] = 1e-4
p['momentum'] = 0.9
p['check_point'] = True
p['check_point'] = False
p['use_cuda'] = torch.cuda.is_available()
dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor'
dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor'
......
@@ -4,8 +4,7 @@
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torchnet
import torch, torch.utils.data
import sparseconvnet as scn
import pickle
import math
......
@@ -4,6 +4,7 @@
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#!/bin/bash
set -e
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00208/Online%20Handwritten%20Assamese%20Characters%20Dataset.rar
unrar e -cl -y "Online Handwritten Assamese Characters Dataset.rar"
mkdir tmp
......
@@ -24,12 +24,13 @@ setup(
packages=['sparseconvnet','sparseconvnet.SCN'],
ext_modules=[
CUDAExtension('sparseconvnet_SCN',
['sparseconvnet/SCN/pybind_cuda.cpp', 'sparseconvnet/SCN/instantiate_cpu.cpp', 'sparseconvnet/SCN/instantiate_cuda.cu'],
[
'sparseconvnet/SCN/cuda.cu', 'sparseconvnet/SCN/sparseconvnet_cuda.cpp', 'sparseconvnet/SCN/pybind.cpp'],
include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'],
extra_compile_args=extra)
if torch.cuda.is_available() else
CppExtension('sparseconvnet_SCN',
['sparseconvnet/SCN/pybind_cpu.cpp', 'sparseconvnet/SCN/instantiate_cpu.cpp'],
['sparseconvnet/SCN/pybind.cpp', 'sparseconvnet/SCN/sparseconvnet_cpu.cpp'],
include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'],
extra_compile_args=extra['cxx'])],
cmdclass={'build_ext': BuildExtension},
......
@@ -4,7 +4,39 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "ActivePooling.h"
// Assume output_features and d_input_features have been zeroed
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
out[plane] += inp[plane] * multiplier;
}
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &d_output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
inp[plane] = out[plane] * multiplier;
}
}
}
template <typename T, Int Dimension>
void cpu_ActivePooling_updateOutput(
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_ACTIVEPOOLING_H
#define CPU_ACTIVEPOOLING_H
// Assume output_features and d_input_features have been zeroed
template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
out[plane] += inp[plane] * multiplier;
}
}
}
template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) {
for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &d_output_features[outSite * nPlanes];
Int *r = &rules[0][outSite * (maxActive + 1)];
Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes];
for (Int plane = 0; plane < nPlanes; plane++)
inp[plane] = out[plane] * multiplier;
}
}
}
#endif /* CPU_ACTIVEPOOLING_H */
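To make the rulebook layout concrete, here is a minimal standalone sketch (not part of the commit). It assumes Int is a 32-bit signed integer and RuleBook is std::vector<std::vector<Int>>, matching how the rest of this codebase defines them; each output site owns maxActive + 1 slots in rules[0], the first holding the count of active inputs. As the comment above the kernels says, the caller must zero the output buffer first.

// Sketch only: exercises ActivePooling_ForwardPass with a hand-built rulebook.
#include <cstdint>
#include <iostream>
#include <vector>
using Int = std::int32_t;                     // assumed definition
using RuleBook = std::vector<std::vector<Int>>; // assumed definition

template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
                               RuleBook &rules, bool average) {
  for (Int outSite = 0; outSite < batchSize; outSite++) {
    T *out = &output_features[outSite * nPlanes];
    Int *r = &rules[0][outSite * (maxActive + 1)];
    Int nActive = *r++;
    T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
    while (nActive-- > 0) {
      T *inp = &input_features[(*r++) * nPlanes];
      for (Int plane = 0; plane < nPlanes; plane++)
        out[plane] += inp[plane] * multiplier;
    }
  }
}

int main() {
  const Int batchSize = 2, maxActive = 3, nPlanes = 2;
  // 4 input sites, 2 planes each.
  std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> out(batchSize * nPlanes, 0); // must be pre-zeroed
  // One row per output site: [nActive, idx0, idx1, idx2]; unused slots
  // are never read because nActive bounds the loop.
  RuleBook rules = {{2, 0, 1, -1,    // site 0 averages input rows 0 and 1
                     1, 3, -1, -1}}; // site 1 copies input row 3
  ActivePooling_ForwardPass<float>(in.data(), out.data(), batchSize,
                                   maxActive, nPlanes, rules, true);
  for (float v : out)
    std::cout << v << ' '; // prints: 2 3 7 8
  std::cout << '\n';
}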
@@ -4,7 +4,68 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AffineReluTrivialConvolution.h"
#include <cstring>
template <typename T>
void AffineReluTrivialConvolution_ForwardPass(
T *input_features, Int input_nPlanes, Int input_stride, T *output_features,
Int output_nPlanes, Int output_stride, T *affineWeight, T *affineBias,
T *convWeight, Int nActive) {
for (Int row = 0; row < nActive; row++) {
for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < input_nPlanes; j++) {
T i = input_features[row * input_stride + j] * affineWeight[j] +
affineBias[j];
i = (i > 0) ? i : 0;
sum += i * convWeight[j * output_nPlanes + column];
}
output_features[row * output_stride + column] = sum;
}
}
}
template <typename T>
void AffineReluTrivialConvolution_BackwardPass(
T *input_features, T *d_input_features, Int input_nPlanes, Int input_stride,
T *d_output_features, Int output_nPlanes, Int output_stride,
T *affineWeight, T *dAffineWeight, T *affineBias, T *dAffineBias,
T *convWeight, T *dConvWeight, Int nActive, bool additiveGrad) {
for (Int row = 0; row < input_nPlanes; row++) {
for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < nActive; j++) {
T i = input_features[j * input_stride + row] * affineWeight[row] +
affineBias[row];
i = (i > 0) ? i : 0;
sum += i * d_output_features[j * output_stride + column];
}
dConvWeight[row * output_nPlanes + column] += sum;
}
}
for (Int row = 0; row < nActive; row++) {
for (Int column = 0; column < input_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < output_nPlanes; j++) {
sum += d_output_features[row * output_stride + j] *
convWeight[column * output_nPlanes + j];
}
T i = input_features[row * input_stride + column] * affineWeight[column] +
affineBias[column];
if (i <= 0) // d_ReLU
sum = 0;
dAffineWeight[column] += sum * i;
dAffineBias[column] += sum;
sum *= affineWeight[column];
if (additiveGrad)
d_input_features[row * input_stride + column] += sum;
else
d_input_features[row * input_stride + column] = sum;
}
}
}
template <typename T>
double cpu_AffineReluTrivialConvolution_updateOutput(
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_AffineReluTrivialConvolution_H
#define CPU_AffineReluTrivialConvolution_H
#include <cstring>
template <typename T>
void AffineReluTrivialConvolution_ForwardPass(
T *input_features, Int input_nPlanes, Int input_stride, T *output_features,
Int output_nPlanes, Int output_stride, T *affineWeight, T *affineBias,
T *convWeight, Int nActive) {
for (Int row = 0; row < nActive; row++) {
for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < input_nPlanes; j++) {
T i = input_features[row * input_stride + j] * affineWeight[j] +
affineBias[j];
i = (i > 0) ? i : 0;
sum += i * convWeight[j * output_nPlanes + column];
}
output_features[row * output_stride + column] = sum;
}
}
}
template <typename T>
void AffineReluTrivialConvolution_BackwardPass(
T *input_features, T *d_input_features, Int input_nPlanes, Int input_stride,
T *d_output_features, Int output_nPlanes, Int output_stride,
T *affineWeight, T *dAffineWeight, T *affineBias, T *dAffineBias,
T *convWeight, T *dConvWeight, Int nActive, bool additiveGrad) {
for (Int row = 0; row < input_nPlanes; row++) {
for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < nActive; j++) {
T i = input_features[j * input_stride + row] * affineWeight[row] +
affineBias[row];
i = (i > 0) ? i : 0;
sum += i * d_output_features[j * output_stride + column];
}
dConvWeight[row * output_nPlanes + column] += sum;
}
}
for (Int row = 0; row < nActive; row++) {
for (Int column = 0; column < input_nPlanes; column++) {
T sum = 0;
for (Int j = 0; j < output_nPlanes; j++) {
sum += d_output_features[row * output_stride + j] *
convWeight[column * output_nPlanes + j];
}
T i = input_features[row * input_stride + column] * affineWeight[column] +
affineBias[column];
if (i <= 0) // d_ReLU
sum = 0;
dAffineWeight[column] += sum * i;
dAffineBias[column] += sum;
sum *= affineWeight[column];
if (additiveGrad)
d_input_features[row * input_stride + column] += sum;
else
d_input_features[row * input_stride + column] = sum;
}
}
}
#endif /* CPU_AffineReluTrivialConvolution_H */
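The forward pass above fuses three steps into one loop nest: a per-plane affine transform, a ReLU, and a 1x1 ("trivial") convolution, i.e. out[row][c] = sum_j max(aW[j] * in[row][j] + aB[j], 0) * W[j][c]. The backward pass recomputes the affine + ReLU activations from input_features rather than caching them, trading a little compute for memory. A minimal sketch (not from the commit) that checks one output against the formula by hand, assuming Int is a signed integer type:

#include <cassert>
#include <cmath>
using Int = int; // assumption

template <typename T>
void AffineReluTrivialConvolution_ForwardPass(
    T *input_features, Int input_nPlanes, Int input_stride, T *output_features,
    Int output_nPlanes, Int output_stride, T *affineWeight, T *affineBias,
    T *convWeight, Int nActive) {
  for (Int row = 0; row < nActive; row++) {
    for (Int column = 0; column < output_nPlanes; column++) {
      T sum = 0;
      for (Int j = 0; j < input_nPlanes; j++) {
        T i = input_features[row * input_stride + j] * affineWeight[j] +
              affineBias[j];
        i = (i > 0) ? i : 0;
        sum += i * convWeight[j * output_nPlanes + column];
      }
      output_features[row * output_stride + column] = sum;
    }
  }
}

int main() {
  // One active site, two input planes, one output plane, dense strides.
  float in[2] = {1.0f, -2.0f};
  float aW[2] = {2.0f, 1.0f}, aB[2] = {0.0f, 1.0f};
  float W[2] = {3.0f, 4.0f}; // input_nPlanes x output_nPlanes
  float out[1];
  AffineReluTrivialConvolution_ForwardPass<float>(in, 2, 2, out, 1, 1, aW, aB,
                                                  W, 1);
  // ReLU(1*2 + 0)*3 + ReLU(-2*1 + 1)*4 = 2*3 + 0 = 6
  assert(std::fabs(out[0] - 6.0f) < 1e-6f);
}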
@@ -4,7 +4,31 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AveragePooling.h"
template <typename T>
void AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane] / filterVolume;
}
}
template <typename T>
void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] +=
d_output_features[o + plane] / filterVolume;
}
}
template <typename T, Int Dimension>
void cpu_AveragePooling_updateOutput(
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_AVERAGEPOOLING_H
#define CPU_AVERAGEPOOLING_H
template <typename T>
void AveragePooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane] / filterVolume;
}
}
template <typename T>
void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot,
Int filterVolume) {
for (Int outSite = 0; outSite < nHot; outSite++) {
Int i = rules[2 * outSite] * input_stride;
Int o = rules[2 * outSite + 1] * output_stride;
for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] +=
d_output_features[o + plane] / filterVolume;
}
}
#endif /* CPU_AVERAGEPOOLING_H */
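Note the rulebook convention differs from ActivePooling: rules here is a flat array of [inputSite, outputSite] pairs, one per active connection, and every contribution is scaled by the fixed 1/filterVolume rather than by the number of inputs an output actually receives (absent filter taps simply contribute zero). A minimal sketch (illustration only, not from the commit), assuming Int is a signed integer type:

#include <cassert>
#include <vector>
using Int = int; // assumption

template <typename T>
void AveragePooling_ForwardPass(T *input_features, T *output_features,
                                Int nPlanes, Int input_stride,
                                Int output_stride, Int *rules, Int nHot,
                                Int filterVolume) {
  for (Int outSite = 0; outSite < nHot; outSite++) {
    Int i = rules[2 * outSite] * input_stride;
    Int o = rules[2 * outSite + 1] * output_stride;
    for (Int plane = 0; plane < nPlanes; plane++)
      output_features[o + plane] += input_features[i + plane] / filterVolume;
  }
}

int main() {
  const Int nPlanes = 1, filterVolume = 4;
  std::vector<float> in = {4, 8}; // two input sites
  std::vector<float> out = {0};   // one output site, pre-zeroed
  Int rules[] = {0, 0, 1, 0};     // both inputs pool into output 0
  AveragePooling_ForwardPass<float>(in.data(), out.data(), nPlanes,
                                    /*input_stride=*/1, /*output_stride=*/1,
                                    rules, /*nHot=*/2, filterVolume);
  assert(out[0] == 3.0f); // (4 + 8) / 4: fixed 1/filterVolume scaling
}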
@@ -4,46 +4,125 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h"
#include <cmath>
#include <cstring>
#include <vector>
// in/output_stride is normally the same as nPlanes; allow other values to act
// on a subset of columns, e.g. for in-place DenseNet blocks
template <typename T>
void cpu_BatchNormalization_updateOutput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor saveMean,
/*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
/*float*/ at::Tensor runningVar,
/*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum,
bool train, T leakiness) {
output_features.resize_as_(input_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BatchNormalization_ForwardPass<T>(
input_features.data<T>(), output_features.data<T>(), nPlanes,
input_stride, output_stride, nActive, saveMean.data<T>(),
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,
momentum, train, leakiness);
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean,
T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
std::memset(saveMean, 0, nPlanes * sizeof(T));
std::memset(saveInvStd, 0, nPlanes * sizeof(T));
for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveMean[plane] += input_features[ci];
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] /= nActive;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
}
for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveInvStd[plane] +=
(input_features[ci] - saveMean[plane]) *
(input_features[ci] - saveMean[plane]); // accumulate sum-squares
// before inverse square
// rooting
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * saveInvStd[plane] / (nActive - 1);
saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
}
} else {
for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] = runningMean[plane];
saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
}
}
std::vector<T> w(nPlanes);
std::vector<T> b(nPlanes);
for (Int plane = 0; plane < nPlanes; plane++) {
w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
}
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T out = input_features[ci] * w[plane] + b[plane];
out = (out > 0) ? out : (out * leakiness);
output_features[co] = out;
}
}
}
template <typename T>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
std::vector<T> gradMean(nPlanes);
std::vector<T> dotp(nPlanes);
std::vector<T> k(nPlanes);
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
gradMean[plane] += d;
dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
if (d_bias)
d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
gradMean[plane] /= nActive; // ...now
k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
}
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
d_input_features[ci] =
(d_output_features[co] - gradMean[plane] -
(input_features[ci] - saveMean[plane]) * k[plane]) *
saveInvStd[plane] * (weight ? weight[plane] : 1);
}
}
if (d_weight)
for (Int plane = 0; plane < nPlanes; plane++) {
d_weight[plane] = dotp[plane] * saveInvStd[plane];
}
}
template <typename T>
void cpu_BatchNormalizationInTensor_updateOutput(
void cpu_BatchNormalization_updateOutput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor saveMean,
/*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
/*float*/ at::Tensor runningVar,
/*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum,
bool train, T leakiness) {
output_features.resize_as_(input_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BatchNormalization_ForwardPass<T>(
input_features.data<T>(), output_features.data<T>(), nPlanes,
input_stride, output_stride, nActive, saveMean.data<T>(),
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_BATCHNORMALIZATION_H
#define CPU_BATCHNORMALIZATION_H
#include <cmath>
#include <cstring>
#include <vector>
// in/output_stride is normally the same as nPlanes; allow other values to act
// on a subset of columns, e.g. for in-place DenseNet blocks
template <typename T>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
std::memset(saveMean, 0, nPlanes * sizeof(T));
std::memset(saveInvStd, 0, nPlanes * sizeof(T));
for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveMean[plane] += input_features[ci];
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] /= nActive;
runningMean[plane] =
momentum * runningMean[plane] + (1 - momentum) * saveMean[plane];
}
for (Int row = 0, ci = 0; row < nActive;
row++, ci += input_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++) {
saveInvStd[plane] +=
(input_features[ci] - saveMean[plane]) *
(input_features[ci] - saveMean[plane]); // accumulate sum-squares
// before inverse square
// rooting
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
runningVar[plane] = momentum * runningVar[plane] +
(1 - momentum) * saveInvStd[plane] / (nActive - 1);
saveInvStd[plane] = powf(saveInvStd[plane] / nActive + eps, -0.5);
}
} else {
for (Int plane = 0; plane < nPlanes; plane++) {
saveMean[plane] = runningMean[plane];
saveInvStd[plane] = powf(runningVar[plane] + eps, -0.5);
}
}
std::vector<T> w(nPlanes);
std::vector<T> b(nPlanes);
for (Int plane = 0; plane < nPlanes; plane++) {
w[plane] = saveInvStd[plane] * (weight ? weight[plane] : 1);
b[plane] = -saveMean[plane] * w[plane] + (bias ? bias[plane] : 0);
}
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T out = input_features[ci] * w[plane] + b[plane];
out = (out > 0) ? out : (out * leakiness);
output_features[co] = out;
}
}
}
template <typename T>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
std::vector<T> gradMean(nPlanes);
std::vector<T> dotp(nPlanes);
std::vector<T> k(nPlanes);
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
T d = d_output_features[co];
d = (output_features[co] > 0) ? d : (d * leakiness);
d_output_features[co] = d;
gradMean[plane] += d;
dotp[plane] += (input_features[ci] - saveMean[plane]) * d;
}
}
for (Int plane = 0; plane < nPlanes; plane++) {
if (d_bias)
d_bias[plane] = gradMean[plane]; // sum of grads, really, until ...
gradMean[plane] /= nActive; // ...now
k[plane] = dotp[plane] * saveInvStd[plane] * saveInvStd[plane] / nActive;
}
for (Int row = 0, ci = 0, co = 0; row < nActive;
row++, ci += input_stride - nPlanes, co += output_stride - nPlanes) {
for (Int plane = 0; plane < nPlanes; plane++, ci++, co++) {
d_input_features[ci] =
(d_output_features[co] - gradMean[plane] -
(input_features[ci] - saveMean[plane]) * k[plane]) *
saveInvStd[plane] * (weight ? weight[plane] : 1);
}
}
if (d_weight)
for (Int plane = 0; plane < nPlanes; plane++) {
d_weight[plane] = dotp[plane] * saveInvStd[plane];
}
}
#endif /* CPU_BATCHNORMALIZATION_H */
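Two details of the forward pass are worth calling out. First, runningVar is updated with the unbiased divisor (nActive - 1) while saveInvStd uses the biased nActive. Second, normalization and the optional affine are folded into a single per-plane scale and shift, w[p] = saveInvStd[p] * weight[p] and b[p] = -saveMean[p] * w[p] + bias[p], so the hot loop is one multiply-add plus the leaky ReLU. A tiny sketch (not from the commit) checking the fold:

#include <cassert>
#include <cmath>

int main() {
  float mean = 2.0f, invStd = 0.5f; // as computed by the forward pass
  float gamma = 3.0f, beta = 1.0f, x = 4.0f;
  float w = invStd * gamma;         // w[plane] in the code above
  float b = -mean * w + beta;       // b[plane] in the code above
  float folded = x * w + b;
  float unfolded = (x - mean) * invStd * gamma + beta;
  assert(std::fabs(folded - unfolded) < 1e-6f); // both give 4.0
}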
@@ -7,7 +7,7 @@
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
/*float*/ at::Tensor noise, float alpha) {
/*float*/ at::Tensor noise, T alpha) {
output_features.resize_as_(input_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
@@ -23,7 +23,7 @@ template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
/*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise,
float alpha) {
T alpha) {
d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
......
@@ -11,7 +11,9 @@ void rule_index_select(at::Tensor target, at::Tensor src, Int nRules,
auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>();
auto n = target.size(1);
for (int i = 0; i < nRules; ++i)
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < nRules; ++i)
std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n);
}
template <typename T>
@@ -20,7 +22,9 @@ void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules,
auto t_ptr = target.data<T>();
auto s_ptr = src.data<T>();
auto n = target.size(1);
for (int i = 0; i < nRules; ++i) {
Int i;
#pragma omp parallel for private(i)
for (i = 0; i < nRules; ++i) {
auto t = t_ptr + rules[2 * i] * n;
auto s = s_ptr + i * n;
for (int j = 0; j < n; ++j)
......
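A note on the hunks above: the loop index is hoisted out of the for statement and listed in private(), presumably for compilers that only implement OpenMP 2.0 (e.g. MSVC) and are strict about the canonical loop form; with OpenMP 3.0+ the index can be declared in the loop itself, as in the equivalent sketch below (rule_index_select_v2 is a hypothetical name, not part of the commit). Parallelizing rule_index_add_ this way is race-free only if each target row rules[2*i] appears at most once, which the rulebook construction is assumed to guarantee.

#include <cstring>
using Int = int; // assumption: a signed integral type, as OpenMP 2.0 requires

template <typename T>
void rule_index_select_v2(T *t_ptr, const T *s_ptr, Int nRules,
                          const Int *rules, Int n) {
#pragma omp parallel for // OpenMP 3.0+: index declared in the loop itself
  for (Int i = 0; i < nRules; ++i)
    std::memcpy(t_ptr + i * n, s_ptr + rules[2 * i] * n, sizeof(T) * n);
}

int main() {
  float src[4] = {1, 2, 3, 4}, dst[4] = {};
  // Only rules[2*i] (the source row) is read by select; target row is i.
  Int rules[] = {1, 0, 0, 0};
  rule_index_select_v2(dst, src, /*nRules=*/2, rules, /*n=*/2);
  // dst is now {3, 4, 1, 2}
}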
@@ -4,7 +4,43 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
#include <cstring>
// Assume output and d_input_features have been zeroed
template <typename T>
void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
}
}
output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T>
void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes, Int *rules,
bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
}
d_output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
@@ -26,8 +62,8 @@ void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
output_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
@@ -47,8 +83,8 @@ void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({rules[0][2], nPlanes});
d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
d_output_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], mode == 4);
}
}
@@ -69,8 +105,8 @@ void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({rules[0][2], nPlanes});
output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
input_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
@@ -90,8 +126,8 @@ void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
d_input_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], false);
}
}
@@ -116,8 +152,8 @@ void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
InputLayer_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
output_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], mode == 4);
}
}
template <typename T, Int Dimension>
@@ -139,8 +175,8 @@ void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
d_input_features.zero_();
InputLayer_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], mode == 4);
d_output_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], mode == 4);
}
}
@@ -162,8 +198,8 @@ void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
output_features.zero_();
InputLayer_BackwardPass<T>(output_features.data<T>(),
input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
input_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], false);
}
}
template <typename T, Int Dimension>
@@ -184,7 +220,7 @@ void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
InputLayer_ForwardPass<T>(d_output_features.data<T>(),
d_input_features.data<T>(), nRows,
maxActive, nPlanes, &rules[1][0], false);
d_input_features.data<T>(), nRows, maxActive,
nPlanes, &rules[1][0], false);
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_IOLAYERS_H
#define CPU_IOLAYERS_H
#include <cstring>
// Assume output and d_input_features have been zeroed
template <typename T>
void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules,
bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
}
}
output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T>
void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (Int row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (Int i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
for (Int plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
}
d_output_features += nPlanes;
rules += 1 + maxActive;
}
}
#endif /* CPU_IOLAYERS_H */
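Notice the reuse across the hunks above: cpu_OutputLayer_updateOutput calls InputLayer_BackwardPass and cpu_OutputLayer_updateGradInput calls InputLayer_ForwardPass, so the output layer is exactly the transpose of the input layer, and the BL variants drive the same two kernels. The rulebook is dense with a stride of 1 + maxActive per row, each row holding [nActive, idx0, idx1, ...]; mode == 4 selects averaging. A minimal sketch (not from the commit) of the averaging path, assuming Int is a signed integer type:

#include <cassert>
#include <vector>
using Int = int; // assumption

template <typename T>
void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
                            Int maxActive, Int nPlanes, Int *rules,
                            bool average) {
  for (Int row = 0; row < nRows; row++) {
    auto nActive = rules[0];
    T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
    for (Int i = 1; i <= nActive; ++i) {
      auto in_f = input_features + nPlanes * rules[i];
      for (Int plane = 0; plane < nPlanes; plane++)
        output_features[plane] += multiplier * in_f[plane];
    }
    output_features += nPlanes;
    rules += 1 + maxActive; // fixed row stride in the rulebook
  }
}

int main() {
  const Int nRows = 1, maxActive = 2, nPlanes = 1;
  std::vector<float> in = {2, 4}; // two raw input points
  std::vector<float> out = {0};   // one active site, pre-zeroed
  Int rules[] = {2, 0, 1};        // both points map to the same site
  InputLayer_ForwardPass<float>(in.data(), out.data(), nRows, maxActive,
                                nPlanes, rules, /*average=*/true);
  assert(out[0] == 3.0f); // (2 + 4) / 2
}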
@@ -6,8 +6,7 @@
template <typename T>
void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features,
float alpha) {
/*float*/ at::Tensor output_features, T alpha) {
output_features.resize_as_(input_features);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
@@ -20,7 +19,7 @@ template <typename T>
void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features,
float alpha) {
T alpha) {
d_input_features.resize_as_(d_output_features);
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
......
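The last two hunks (BatchwiseMultiplicativeDropout and LeakyReLU) make the same signature fix: alpha was typed float even when T is double, silently narrowing the slope. A tiny sketch (not from the commit) of the narrowing the change removes, assuming that was the motivation:

#include <cassert>

template <typename T> T leaky_float_alpha(T x, float alpha) {
  return x > 0 ? x : x * alpha; // slope already narrowed at the call site
}
template <typename T> T leaky_T_alpha(T x, T alpha) {
  return x > 0 ? x : x * alpha; // slope kept in the tensor's precision
}

int main() {
  double alpha = 0.1; // not exactly representable in float
  double x = -1e9;
  assert(leaky_float_alpha(x, (float)alpha) != leaky_T_alpha(x, alpha));
}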