Commit de3743f6 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

Factor out CUDA code

parent f0407b36
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
// Forward pass of batchwise multiplicative dropout fused with a leaky-ReLU
// style rectifier:
//   out[row, plane] = in[row, plane] * noise[plane] * (in > 0 ? 1 : alpha)
// One noise value per feature plane; each x-tile of NTX planes stages its
// noise values in shared memory (loaded by the threadIdx.y == 0 row).
// Launch assumption: blockDim == (NTX, NTY) -- TODO confirm at the call site.
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
T *output_features, T *noise,
Int nActive, Int nPlanes,
Int input_stride,
Int output_stride, T alpha) {
// Per-plane noise for the NTX planes handled by this x-tile.
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads(); // publish nz[] to all rows of the block
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
output_features[o] = input_features[i] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads(); // protect nz[] before the next tile overwrites it
}
}
// Backward pass of the fused dropout + rectifier above: the upstream
// gradient is scaled by the same factor noise[plane] * (in > 0 ? 1 : alpha)
// that was applied in the forward pass. Shared-memory staging and barrier
// placement mirror the forward kernel; both __syncthreads() are outside any
// divergent branch, so all threads reach them.
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
T *d_output_features, T *noise, Int nActive,
Int nPlanes, Int input_stride,
Int output_stride, T alpha) {
// Per-plane noise for the NTX planes handled by this x-tile.
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads(); // publish nz[] to all rows of the block
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads(); // protect nz[] before the next tile overwrites it
}
}
#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Apply bias b over the nActiveOut x nPlanes output matrix `of`
// (presumably a broadcast add of b across rows -- confirm in the .cu file).
template <typename T>
void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut);
// Bias-gradient reduction of `matrix` into `target`.
// NOTE(review): nColumns vs nCOLUMNS looks like logical width vs row stride;
// every call site in this file passes the same value for both -- confirm at
// the definition.
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS);
// Rulebook-driven convolution forward driver. The double return value is
// presumably a multiply-accumulate count for FLOP accounting -- TODO confirm.
template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
// Backward driver accumulating both d(input) and d(weight) in one pass.
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
// Forward sparse convolution. Builds (or fetches) the rulebook, sizes the
// output to {nActiveOut, outPlanes}, seeds it with the bias (or zeros), then
// runs the rulebook-driven GEMM. Returns dConvolution_forward2's result
// (0 when there are no active output sites).
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0; // no active output sites: nothing to compute
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward sparse convolution: fills d_input_features ({nActiveIn, inPlanes},
// zero-initialised), accumulates d_weight, and reduces the bias gradient
// when d_bias is non-empty. No-op when there are no active output sites.
template <typename T, Int Dimension>
void cuda_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  const Int nActiveIn = m.getNActive(inputSize);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return; // nothing active downstream: gradients stay untouched
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
// Forward submanifold convolution: the output keeps the input's active-site
// set, so the rulebook needs only input size and filter size. Returns the
// GEMM driver's result, or 0 when the input has no active sites.
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  const Int nActive = m.getNActive(inputSize);
  if (!nActive)
    return 0; // empty input: nothing to compute
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActive, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActive);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward submanifold convolution: input and output share the same active
// set, so both gradients use nActive rows. No-op when nothing is active.
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  const Int nActive = m.getNActive(inputSize);
  if (!nActive)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActive, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActive);
}
// Forward "full" convolution: input and output live in different metadata
// objects (mIn / mOut); the rulebook is built across the pair. Returns the
// GEMM driver's result, or 0 when the output has no active sites.
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb = mIn.getFullConvolutionRuleBook(inputSize, outputSize, filterSize,
                                           filterStride, mOut);
  const Int nActiveOut = mOut.getNActive(outputSize);
  if (!nActiveOut)
    return 0;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward "full" convolution: active counts come from the two metadata
// objects (nActiveIn from mIn, nActiveOut from mOut). No-op when the output
// has no active sites.
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb = mIn.getFullConvolutionRuleBook(inputSize, outputSize, filterSize,
                                           filterStride, mOut);
  const Int nActiveIn = mIn.getNActive(inputSize);
  const Int nActiveOut = mOut.getNActive(outputSize);
  if (!nActiveOut)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
// Forward convolution with randomized strides; identical to the regular
// forward pass except the rulebook comes from getRandomizedStrideRuleBook.
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                          filterStride, true);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dConvolution_forward2<T>(input_features.data<T>(),
                                  output_features.data<T>(), weight.data<T>(),
                                  rb, inPlanes, inPlanes, outPlanes, outPlanes);
}
// Backward convolution with randomized strides; mirrors the regular backward
// pass with the randomized-stride rulebook.
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                          filterStride, true);
  const Int nActiveIn = m.getNActive(inputSize);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dConvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
This diff is collapsed.
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Rulebook-driven deconvolution forward driver (defined in the .cu file).
// The double return value is presumably a multiply-accumulate count for FLOP
// accounting, matching dConvolution_forward2 -- TODO confirm.
template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
// Backward driver accumulating both d(input) and d(weight) in one pass.
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride);
// Forward deconvolution. Note the rulebook is requested with outputSize and
// inputSize swapped: deconvolution reuses the convolution rulebook with the
// input/output roles exchanged.
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto rb =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return 0;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  output_features.resize_({nActiveOut, outPlanes});
  // Seed the output with the bias if present, otherwise with zeros.
  if (bias.numel())
    Convolution_fp_bias(output_features.data<T>(), bias.data<T>(), outPlanes,
                        nActiveOut);
  else
    output_features.zero_();
  return dDeconvolution_forward2<T>(input_features.data<T>(),
                                    output_features.data<T>(), weight.data<T>(),
                                    rb, inPlanes, inPlanes, outPlanes,
                                    outPlanes);
}
// Backward deconvolution; like the forward pass, the rulebook is built with
// outputSize and inputSize swapped relative to the convolution case.
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto rb =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  const Int nActiveIn = m.getNActive(inputSize);
  const Int nActiveOut = m.getNActive(outputSize);
  if (!nActiveOut)
    return;
  const Int inPlanes = weight.size(1);
  const Int outPlanes = weight.size(2);
  d_input_features.resize_({nActiveIn, inPlanes});
  d_input_features.zero_();
  // One pass computes both d(input) and d(weight).
  dDeconvolution_backward_dW2<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      d_output_features.data<T>(), weight.data<T>(), d_weight.data<T>(), rb,
      inPlanes, inPlanes, outPlanes, outPlanes);
  if (d_bias.numel())
    Convolution_bp_bias(d_output_features.data<T>(), d_bias.data<T>(),
                        outPlanes, outPlanes, nActiveOut);
}
This diff is collapsed.
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Gather forward: for each of nRows output rows, sum (or average, when
// `average` is true) the input rows listed in its rule. rules_cpu points at
// host memory; rules_gpu is a device buffer the implementation presumably
// stages the rules into -- confirm in the defining .cu file.
template <typename T>
void InputLayer_fp(T *input_features, T *output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
bool average);
// Backward counterpart: scatter-add each output row's gradient back to its
// source input rows.
template <typename T>
void InputLayer_bp(T *d_input_features, T *d_output_features, Int nRows,
Int maxActive, Int nPlanes, Int *rules_cpu, Int *rules_gpu,
bool average);
// Input layer forward pass: registers the coordinates with the metadata
// (building m.inputLayerRuleBook), then either copies the features through
// (mode 0) or gathers them into the sparse layout. mode == 4 averages the
// gathered rows instead of summing.
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto &rules = m.inputLayerRuleBook;
  if (mode == 0) {
    // Mode 0: identity copy.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  Int nPlanes = input_features.size(1);
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  // Scratch device buffer for the rules.
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_fp<T>(input_features.data<T>(), output_features.data<T>(), nRows,
                   maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                   mode == 4);
}
// Input layer backward pass: scatter-adds the output gradient back to the
// original input rows using the rulebook built in the forward pass.
template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto mode = rules[0][0];
  if (mode == 0) {
    // The forward pass was a plain copy; so is the backward pass.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  Int nPlanes = d_output_features.size(1);
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  // rules[0][2] is presumably the original number of input rows -- confirm
  // against the rulebook layout.
  d_input_features.resize_({rules[0][2], nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// Output layer forward pass: the transpose of the input layer. Sparse
// features are scattered back to the original rows, so the forward pass
// reuses the input layer's *backward* kernel, never averaging.
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*cuda float*/ at::Tensor input_features,
                                   /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto mode = rules[0][0];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    return;
  }
  Int nPlanes = input_features.size(1);
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  output_features.resize_({rules[0][2], nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  // Note the swapped roles: output is the scatter destination.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                false);
}
// Output layer backward pass: gathers the dense gradient back into the
// sparse layout using the input layer's *forward* kernel, never averaging.
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto mode = rules[0][0];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    return;
  }
  Int nPlanes = d_output_features.size(1);
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &rules[1][0],
                   rulesBuffer.data<Int>(), false);
}
// Batch+length ("BL") input layer forward pass: converts dense
// (batch, length, nPlanes) features into the sparse active-site layout.
// NOTE(review): the resize_/zero_ before the mode check is redundant in
// mode 0, where the copy overwrites it; the statement order is preserved
// as-is because resize_/copy_/resize_ sequencing is load-bearing.
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long mode) {
// Register the coordinates and build m.blLayerRuleBook.
m.blLayer(spatialSize, input_coords, mode);
Int nPlanes = input_features.size(2);
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto &rules = m.blLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
// Mode 0: plain copy, reshaped to the sparse layout.
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({*m.inputNActive, nPlanes});
} else {
// Scratch device buffer for the rules.
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
// mode == 4 averages the gathered rows instead of summing.
InputLayer_fp<T>(iF, oF, nRows, maxActive, nPlanes, &rules[1][0], rb,
mode == 4);
}
}
// BL input layer backward pass: scatter-adds the sparse gradient back into
// the dense (batch, length, nPlanes) layout described by rules[0][2..3].
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Plain copy, reshaped back to the dense layout.
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    return;
  }
  d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_bp(d_input_features.data<T>(), d_output_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                mode == 4);
}
// BL output layer forward pass: scatters sparse features back to the dense
// (batch, length, nPlanes) layout via the input layer's backward kernel,
// never averaging.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    // Plain copy, reshaped to the dense layout.
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    return;
  }
  output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  output_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  // Note the swapped roles: output is the scatter destination.
  InputLayer_bp(output_features.data<T>(), input_features.data<T>(), nRows,
                maxActive, nPlanes, &rules[1][0], rulesBuffer.data<Int>(),
                false);
}
// BL output layer backward pass: gathers the dense gradient back into the
// sparse {nRows, nPlanes} layout via the input layer's forward kernel.
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(2);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
    return;
  }
  d_input_features.resize_({nRows, nPlanes});
  d_input_features.zero_();
  auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
  InputLayer_fp<T>(d_output_features.data<T>(), d_input_features.data<T>(),
                   nRows, maxActive, nPlanes, &rules[1][0],
                   rulesBuffer.data<Int>(), false);
}
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_IOLAYERS_H
#define CUDA_IOLAYERS_H
// Gather forward kernel: one block per output row (grid-stride over rows),
// threads striding over planes. Each row's rule is packed as
// [count, src0, src1, ...] with stride (1 + maxActive); the listed source
// rows are accumulated into the output row, averaged when `average` is set.
// Accumulates with += -- assumes output_features was pre-zeroed (the call
// sites in this file do zero it). No synchronisation needed: each output
// element is only written by one thread.
template <typename T>
__global__ void InputLayer_fp(T *input_features, T *output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
// Averaging divides by the source count, guarding against zero sources.
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
out[plane] += multiplier * inp[plane];
}
}
}
// Backward counterpart of InputLayer_fp: scatter-adds each output row's
// gradient back to its source rows. atomicAdd is required because the same
// source row may be listed by several output rows.
template <typename T>
__global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = d_output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
// Same scaling as the forward pass so the gradient matches.
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = d_input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
atomicAdd(&inp[plane], multiplier * out[plane]);
}
}
}
#endif /* CUDA_IOLAYERS_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// Element-wise leaky ReLU launchers (defined in the corresponding .cu file).
template <typename T>
void LeakyReLU_fp(T *input_features, T *output_features, Int n, T alpha);
// Backward pass. NOTE(review): despite the name, the third parameter is the
// upstream gradient (d_output) at the call sites below -- confirm at the
// definition.
template <typename T>
void LeakyReLU_bp(T *input_features, T *d_input_features, T *output_features,
Int n, T alpha);
// Leaky ReLU forward: resize the output to match the input and apply the
// element-wise kernel launcher over all elements.
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
                                 /*cuda float*/ at::Tensor output_features,
                                 T alpha) {
  output_features.resize_as_(input_features);
  LeakyReLU_fp<T>(input_features.data<T>(), output_features.data<T>(),
                  input_features.numel(), alpha);
}
// Leaky ReLU backward: gate the upstream gradient on the sign of the
// forward-pass input.
template <typename T>
void cuda_LeakyReLU_updateGradInput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features, T alpha) {
  d_input_features.resize_as_(d_output_features);
  LeakyReLU_bp<T>(input_features.data<T>(), d_input_features.data<T>(),
                  d_output_features.data<T>(), d_input_features.numel(),
                  alpha);
}
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
// NOTE(review): this chunk was scrape-garbled (two diff columns interleaved);
// reconstructed from the new-version column of the diff.
#include "LeakyReLU.h"
// Strided element-wise leaky ReLU forward kernel; the 16 * 1024 stride
// matches the fixed <<<16, 1024>>> launch configuration below.
template <typename T>
__global__ void LeakyReLU_fp_(T *input_features, T *output_features, Int n,
                              T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
    output_features[i] = (input_features[i] > 0) ? input_features[i]
                                                 : (input_features[i] * alpha);
}
// Host launcher for the forward kernel.
template <typename T>
void LeakyReLU_fp(T *input_features, T *output_features, Int n, T alpha) {
  LeakyReLU_fp_<T><<<16, 1024>>>(input_features, output_features, n, alpha);
}
// Backward kernel: pass the upstream gradient through where the forward
// input was positive, scale it by alpha elsewhere.
template <typename T>
__global__ void LeakyReLU_bp_(T *input_features, T *d_input_features,
                              T *d_output_features, Int n, T alpha) {
  for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
// Host launcher for the backward kernel (third parameter is the upstream
// gradient, matching the forward declaration's name).
template <typename T>
void LeakyReLU_bp(T *input_features, T *d_input_features, T *output_features,
                  Int n, T alpha) {
  LeakyReLU_bp_<T><<<16, 1024>>>(input_features, d_input_features,
                                 output_features, n, alpha);
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
// Element-wise leaky ReLU: out = in if in > 0, else alpha * in.
// The 16 * 1024 stride matches the fixed <<<16, 1024>>> launch used by the
// callers of this header.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
                             T alpha) {
  for (Int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += 16 * 1024) {
    T v = input_features[idx];
    output_features[idx] = (v > 0) ? v : (v * alpha);
  }
}
// Backward leaky ReLU: pass the upstream gradient through where the forward
// input was positive, scale it by alpha elsewhere.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, Int n, T alpha) {
  for (Int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += 16 * 1024) {
    T g = d_output_features[idx];
    d_input_features[idx] = (input_features[idx] > 0) ? g : (g * alpha);
  }
}
#endif
This diff is collapsed.
This diff is collapsed.
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_MAXPOOLING_H
#define CUDA_MAXPOOLING_H
// NTX must be >=2 so r is filled properly
// Max-pooling forward. `rules` holds nHot (inputRow, outputRow) pairs; each
// grid-stride iteration stages up to NTY pairs in shared memory (loaded
// cooperatively by the first 2*NTY flattened thread ids), then each y-thread
// handles one pair while x-threads stride over planes, folding the input row
// into the output row with a running element-wise max.
// NOTE(review): relies on output_features holding a valid initial value
// (e.g. a very negative fill or prior data) -- confirm at the call sites.
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
// Cooperative load of this tile's rule pairs, clamped to nHot - n.
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads(); // publish r[] to the whole block
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
T inp = input_features[i + plane];
if (output_features[o + plane] < inp)
output_features[o + plane] = inp;
}
}
__syncthreads(); // protect r[] before the next tile overwrites it
}
}
// Host launcher: fixed grid of 32 blocks of 32x32 threads; the kernel's
// grid-stride loops cover any nHot and nPlanes.
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
// Max-pooling backward: for each (inputRow, outputRow) rule pair, route the
// output gradient back to the input elements that equal the pooled output
// (i.e. the argmax winners). Rule staging mirrors MaxPooling_fp.
// NOTE(review): the += on d_input_features is not atomic; this presumably
// assumes non-overlapping pooling regions (each input row appears in at most
// one rule) -- confirm against rulebook construction.
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
// Cooperative load of this tile's rule pairs, clamped to nHot - n.
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads(); // publish r[] to the whole block
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
__syncthreads(); // protect r[] before the next tile overwrites it
}
}
// Host launcher for the backward kernel; same fixed 32 x (32,32) launch as
// the forward pass.
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, rules, nHot);
}
#endif /* CUDA_MAXPOOLING_H */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment