Commit d77687a6 authored by Benjamin Graham's avatar Benjamin Graham Committed by Benjamin Thomas Graham
Browse files

Rename ValidConvolutions to SubmanifoldConvolutions, update for PyTorch 0.4 Tensor/Variable merge

parent 297e04c0
...@@ -15,7 +15,7 @@ if torch.cuda.is_available(): ...@@ -15,7 +15,7 @@ if torch.cuda.is_available():
r = os.system( r = os.system(
'cd sparseconvnet/SCN; nvcc init.cu -c -o init.cu.o -ccbin /usr/bin/cc' 'cd sparseconvnet/SCN; nvcc init.cu -c -o init.cu.o -ccbin /usr/bin/cc'
+ ' -m64 --std c++11 -Xcompiler ' + ' -m64 --std c++11 -Xcompiler '
+ ',\"-fopenmp\",\"-fPIC\",\"-O3\",\"-DNDEBUG\" ' + ',\"-fopenmp\",\"-fPIC\",\"-O3\" '
+ '-gencode arch=compute_62,code=sm_62 ' + '-gencode arch=compute_62,code=sm_62 '
+ '-gencode arch=compute_61,code=sm_61 ' + '-gencode arch=compute_61,code=sm_61 '
+ '-gencode arch=compute_60,code=sm_60 ' + '-gencode arch=compute_60,code=sm_60 '
...@@ -40,10 +40,11 @@ if torch.cuda.is_available(): ...@@ -40,10 +40,11 @@ if torch.cuda.is_available():
this_dir + this_dir +
'/sparseconvnet/SCN/init.cu.o'], '/sparseconvnet/SCN/init.cu.o'],
relative_to=__file__, relative_to=__file__,
extra_compile_args=["-std=c99"],
with_cuda=True) with_cuda=True)
else: else:
r = os.system( r = os.system(
'cd sparseconvnet/SCN; g++ -std=c++11 -fPIC -c init.cpp -o init.cpp.o -I' + 'cd sparseconvnet/SCN; g++ -std=c++11 -DENABLE_OPENMP -fPIC -c init.cpp -o init.cpp.o -I' +
torch_dir + torch_dir +
'/lib/include -I' + '/lib/include -I' +
torch_dir + torch_dir +
...@@ -57,6 +58,7 @@ else: ...@@ -57,6 +58,7 @@ else:
this_dir + this_dir +
'/sparseconvnet/SCN/init.cpp.o'], '/sparseconvnet/SCN/init.cpp.o'],
relative_to=__file__, relative_to=__file__,
extra_compile_args=["-std=c99"],
with_cuda=False) with_cuda=False)
ffi.build() ffi.build()
......
...@@ -68,13 +68,13 @@ extern "C" void scn_DR_(Convolution_backward)( ...@@ -68,13 +68,13 @@ extern "C" void scn_DR_(Convolution_backward)(
} }
} }
extern "C" double scn_DR_(ValidConvolution_updateOutput)( extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m, THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THTensor *input_features, THTensor *output_features, THTensor *weight, THTensor *input_features, THTensor *output_features, THTensor *weight,
THTensor *bias, long filterVolume, void *rulesBuffer) { THTensor *bias, long filterVolume, void *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize); uInt nActive = _m.getNActive(inputSize);
THTensor_(resize2d)(output_features, nActive, weight->size[1]); THTensor_(resize2d)(output_features, nActive, weight->size[1]);
if (not bias) if (not bias)
...@@ -97,14 +97,14 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)( ...@@ -97,14 +97,14 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
return flops; return flops;
} }
extern "C" void scn_DR_(ValidConvolution_backward)( extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m, THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THTensor *input_features, THTensor *d_input_features, THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *weight, THTensor *d_weight, THTensor *d_output_features, THTensor *weight, THTensor *d_weight,
THTensor *d_bias, long filterVolume, void *rulesBuffer) { THTensor *d_bias, long filterVolume, void *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize); uInt nActive = _m.getNActive(inputSize);
THTensor_(resizeAs)(d_input_features, input_features); THTensor_(resizeAs)(d_input_features, input_features);
THTensor_(zero)(d_input_features); THTensor_(zero)(d_input_features);
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/CPU/InputLayer.cpp"
#else
#include "InputLayer.h"
// CPU input layer forward: build the input-layer rulebook from the raw
// coordinate rows, then pool input rows into one output row per active site.
extern "C" void scn_DR_(InputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THTensor *input_features, THTensor *output_features, long batchSize,
    long mode, void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.inputLayer(spatialSize, input_coords, batchSize, mode);
  auto const planes = input_features->size[1];
  // One output row per active site; zero before accumulation.
  THTensor_(resize2d)(output_features, *_m.inputNActive, planes);
  THTensor_(zero)(output_features);
  auto &rb = _m.inputLayerRuleBook;
  // rb[0] header: [mode, maxActive, nInputRows, nOutputRows]; mode 4 == mean.
  InputLayer_ForwardPass<real>(THTensor_(data)(input_features),
                               THTensor_(data)(output_features),
                               /*nRows=*/rb[0][3], /*maxActive=*/rb[0][1],
                               planes, &rb[1][0], /*average=*/mode == 4);
}
// CPU input layer backward: scatter gradients from pooled output rows back
// onto the original input rows, using the rulebook built in updateOutput.
extern "C" void scn_DR_(InputLayer_updateGradInput)(void **m,
                                                    THTensor *d_input_features,
                                                    THTensor *d_output_features,
                                                    void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rb = _m.inputLayerRuleBook;
  auto const planes = d_output_features->size[1];
  // rb[0] header: [mode, maxActive, nInputRows, nOutputRows].
  THTensor_(resize2d)(d_input_features, rb[0][2], planes);
  THTensor_(zero)(d_input_features);
  InputLayer_BackwardPass<real>(THTensor_(data)(d_input_features),
                                THTensor_(data)(d_output_features),
                                /*nRows=*/rb[0][3], /*maxActive=*/rb[0][1],
                                planes, &rb[1][0],
                                /*average=*/rb[0][0] == 4);
}
// CPU batch/length input layer forward: coordinates arrive as a
// batchSize x length x dimension array; build the bl rulebook and pool.
extern "C" void scn_DR_(BLInputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THTensor *input_features, THTensor *output_features, long mode,
    void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.blLayer(spatialSize, input_coords, mode);
  auto const planes = input_features->size[2];
  THTensor_(resize2d)(output_features, *_m.inputNActive, planes);
  THTensor_(zero)(output_features);
  auto &rb = _m.blLayerRuleBook;
  // rb[0] header: [mode, maxActive, batchSize, length, nActive].
  InputLayer_ForwardPass<real>(THTensor_(data)(input_features),
                               THTensor_(data)(output_features),
                               /*nRows=*/rb[0][4], /*maxActive=*/rb[0][1],
                               planes, &rb[1][0], /*average=*/mode == 4);
}
// CPU batch/length input layer backward: gradient w.r.t. the
// (batchSize x length x nPlanes) input tensor.
extern "C" void
scn_DR_(BLInputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                      THTensor *d_output_features,
                                      void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rb = _m.blLayerRuleBook;
  auto const planes = d_output_features->size[1];
  // rb[0] header: [mode, maxActive, batchSize, length, nActive].
  THTensor_(resize3d)(d_input_features, rb[0][2], rb[0][3], planes);
  THTensor_(zero)(d_input_features);
  InputLayer_BackwardPass<real>(THTensor_(data)(d_input_features),
                                THTensor_(data)(d_output_features),
                                /*nRows=*/rb[0][4], /*maxActive=*/rb[0][1],
                                planes, &rb[1][0],
                                /*average=*/rb[0][0] == 4);
}
// CPU batch/length output layer forward: un-pool active-site features back
// into the dense (batchSize x length x nPlanes) layout. Reuses the backward
// scatter kernel because the mapping is the inverse of the input layer.
// Fix: dropped the unused local `mode` (it was read from rules[0][0] but
// never used; averaging is deliberately disabled here with `false`).
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(
    void **m,
    THTensor *input_features, THTensor *output_features,
    void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = input_features->size[1];
  // rules[0] header: [mode, maxActive, batchSize, length, nActive].
  THTensor_(resize3d)(output_features, rules[0][2], rules[0][3], nPlanes);
  THTensor_(zero)(output_features);
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  InputLayer_BackwardPass<real>(THTensor_(data)(output_features),
                                THTensor_(data)(input_features), nRows,
                                maxActive, nPlanes, &rules[1][0], false);
}
// CPU batch/length output layer backward: gather gradients from the dense
// layout back into one row per active site, via the forward gather kernel.
// Fix: dropped the unused local `mode` (read from rules[0][0], never used;
// averaging is deliberately disabled here with `false`).
extern "C" void
scn_DR_(BLOutputLayer_updateGradInput)(void **m, THTensor *d_input_features,
                                       THTensor *d_output_features,
                                       void *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  auto nPlanes = d_output_features->size[2];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][4];
  THTensor_(resize2d)(d_input_features, nRows, nPlanes);
  THTensor_(zero)(d_input_features);
  InputLayer_ForwardPass<real>(THTensor_(data)(d_output_features),
                               THTensor_(data)(d_input_features), nRows,
                               maxActive, nPlanes, &rules[1][0], false);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CPU_INPUTLAYER_H
#define CPU_INPUTLAYER_H
#include "../SparseConvNet.h"
#include <cstring>
// Assume output and d_input_features have been zero-ed
template <typename T>
void InputLayer_ForwardPass(T *input_features, T *output_features, uInt nRows,
uInt maxActive, uInt nPlanes, uInt *rules,
bool average) {
for (uInt row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (uInt i = 1; i <= nActive; ++i) {
auto in_f = input_features + nPlanes * rules[i];
for (uInt plane = 0; plane < nPlanes; plane++) {
output_features[plane] += multiplier * in_f[plane];
}
}
output_features += nPlanes;
rules += 1 + maxActive;
}
}
template <typename T>
void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
uInt nRows, uInt maxActive, uInt nPlanes,
uInt *rules, bool average) {
for (uInt row = 0; row < nRows; row++) {
auto nActive = rules[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (uInt i = 1; i <= nActive; ++i) {
auto d_in_f = d_input_features + nPlanes * rules[i];
for (uInt plane = 0; plane < nPlanes; plane++)
d_in_f[plane] += multiplier * d_output_features[plane];
}
d_output_features += nPlanes;
rules += 1 + maxActive;
}
}
#endif /* CPU_INPUTLAYER_H */
...@@ -87,12 +87,12 @@ extern "C" void scn_DR_(Convolution_backward)( ...@@ -87,12 +87,12 @@ extern "C" void scn_DR_(Convolution_backward)(
} }
} }
extern "C" double scn_DR_(ValidConvolution_updateOutput)( extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m, THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *output_features, THCTensor *weight, THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) { THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize); uInt nActive = _m.getNActive(inputSize);
THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]); THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
if (not bias) if (not bias)
...@@ -125,13 +125,13 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)( ...@@ -125,13 +125,13 @@ extern "C" double scn_DR_(ValidConvolution_updateOutput)(
return flops; return flops;
} }
extern "C" void scn_DR_(ValidConvolution_backward)( extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
THLongTensor *inputSize, THLongTensor *filterSize, void **m, THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THCTensor *input_features, THCTensor *d_input_features, THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight, THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) { THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
auto _rules = _m.getValidRuleBook(inputSize, filterSize, true); auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
uInt nActive = _m.getNActive(inputSize); uInt nActive = _m.getNActive(inputSize);
THCTensor_(resizeAs)(state, d_input_features, input_features); THCTensor_(resizeAs)(state, d_input_features, input_features);
THCTensor_(zero)(state, d_input_features); THCTensor_(zero)(state, d_input_features);
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/InputLayer.cu"
#else
#include "InputLayer.h"
// GPU input layer forward: build the rulebook on the host, stage it in the
// device-side `rulesBuffer`, then launch the pooling kernel on the current
// THC stream (one block per output row, capped grid/block dimensions).
extern "C" void scn_DR_(InputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THCTensor *input_features, THCTensor *output_features, long batchSize,
    long mode, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.inputLayer(spatialSize, input_coords, batchSize, mode);
  uInt planes = input_features->size[1];
  THCTensor_(resize2d)(state, output_features, *_m.inputNActive, planes);
  THCTensor_(zero)(state, output_features);
  auto &hostRules = _m.inputLayerRuleBook;
  uInt mA = hostRules[0][1];
  uInt rows = hostRules[0][3];
  // Copy the flat host rule array into the reusable device buffer.
  size_t ruleBytes = sizeof(uInt) * hostRules[1].size();
  THCITensor_resize1d(state, rulesBuffer, ruleBytes);
  uInt *deviceRules = (uInt *)THCITensor_data(state, rulesBuffer);
  cudaMemcpy(deviceRules, &hostRules[1][0], ruleBytes, cudaMemcpyHostToDevice);
  InputLayer_fp<real><<<std::min(rows, 32768U), std::min(planes, 32U), 0,
                        THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features), rows, mA, planes, deviceRules,
      mode == 4);
}
// GPU input layer backward: re-upload the rulebook and scatter gradients
// back to the raw input rows on the current THC stream.
extern "C" void
scn_DR_(InputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
                                    THCTensor *d_output_features,
                                    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &hostRules = _m.inputLayerRuleBook;
  uInt planes = d_output_features->size[1];
  // hostRules[0] header: [mode, maxActive, nInputRows, nOutputRows].
  THCTensor_(resize2d)(state, d_input_features, hostRules[0][2], planes);
  THCTensor_(zero)(state, d_input_features);
  uInt poolMode = hostRules[0][0];
  uInt mA = hostRules[0][1];
  uInt rows = hostRules[0][3];
  size_t ruleBytes = sizeof(uInt) * hostRules[1].size();
  THCITensor_resize1d(state, rulesBuffer, ruleBytes);
  uInt *deviceRules = (uInt *)THCITensor_data(state, rulesBuffer);
  cudaMemcpy(deviceRules, &hostRules[1][0], ruleBytes, cudaMemcpyHostToDevice);
  InputLayer_bp<real><<<std::min(rows, 32768U), std::min(planes, 32U), 0,
                        THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features), rows, mA, planes,
      deviceRules, poolMode == 4);
}
// GPU batch/length input layer forward: coordinates arrive as a
// batchSize x length x dimension array; build the bl rulebook, stage it on
// the device, and pool.
extern "C" void scn_DR_(BLInputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THCTensor *input_features, THCTensor *output_features, long mode,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.blLayer(spatialSize, input_coords, mode);
  uInt planes = input_features->size[2];
  THCTensor_(resize2d)(state, output_features, *_m.inputNActive, planes);
  THCTensor_(zero)(state, output_features);
  auto &hostRules = _m.blLayerRuleBook;
  // hostRules[0] header: [mode, maxActive, batchSize, length, nActive].
  uInt mA = hostRules[0][1];
  uInt rows = hostRules[0][4];
  size_t ruleBytes = sizeof(uInt) * hostRules[1].size();
  THCITensor_resize1d(state, rulesBuffer, ruleBytes);
  uInt *deviceRules = (uInt *)THCITensor_data(state, rulesBuffer);
  cudaMemcpy(deviceRules, &hostRules[1][0], ruleBytes, cudaMemcpyHostToDevice);
  InputLayer_fp<real><<<std::min(rows, 32768U), std::min(planes, 32U), 0,
                        THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features), rows, mA, planes, deviceRules,
      mode == 4);
}
// GPU batch/length input layer backward: gradient w.r.t. the
// (batchSize x length x nPlanes) input tensor.
extern "C" void
scn_DR_(BLInputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
                                      THCTensor *d_output_features,
                                      THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &hostRules = _m.blLayerRuleBook;
  uInt planes = d_output_features->size[1];
  // hostRules[0] header: [mode, maxActive, batchSize, length, nActive].
  THCTensor_(resize3d)(state, d_input_features, hostRules[0][2],
                       hostRules[0][3], planes);
  THCTensor_(zero)(state, d_input_features);
  uInt poolMode = hostRules[0][0];
  uInt mA = hostRules[0][1];
  uInt rows = hostRules[0][4];
  size_t ruleBytes = sizeof(uInt) * hostRules[1].size();
  THCITensor_resize1d(state, rulesBuffer, ruleBytes);
  uInt *deviceRules = (uInt *)THCITensor_data(state, rulesBuffer);
  cudaMemcpy(deviceRules, &hostRules[1][0], ruleBytes, cudaMemcpyHostToDevice);
  InputLayer_bp<real><<<std::min(rows, 32768U), std::min(planes, 32U), 0,
                        THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features), rows, mA, planes,
      deviceRules, poolMode == 4);
}
// GPU batch/length output layer forward: un-pool active-site features back
// into the dense (batchSize x length x nPlanes) layout via the backward
// scatter kernel (the mapping is the inverse of the input layer).
// Fix: dropped the unused local `mode` (read from rules[0][0], never used;
// averaging is deliberately disabled here with `false`).
extern "C" void scn_DR_(BLOutputLayer_updateOutput)(
    void **m,
    THCTensor *input_features, THCTensor *output_features,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  uInt nPlanes = input_features->size[1];
  // rules[0] header: [mode, maxActive, batchSize, length, nActive].
  THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3], nPlanes);
  THCTensor_(zero)(state, output_features);
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][4];
  THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto rb = (uInt*) THCITensor_data(state, rulesBuffer);
  cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
             cudaMemcpyHostToDevice);
  InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                        THCState_getCurrentStream(state)>>>(
      oF, iF, nRows, maxActive, nPlanes, rb, false);
}
// GPU batch/length output layer backward: gather gradients from the dense
// layout back into one row per active site, via the forward gather kernel.
// Fix: dropped the unused local `mode` (read from rules[0][0], never used;
// averaging is deliberately disabled here with `false`).
extern "C" void
scn_DR_(BLOutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
                                       THCTensor *d_output_features,
                                       THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  uInt nPlanes = d_output_features->size[2];
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][4];
  THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
  THCTensor_(zero)(state, d_input_features);
  THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto rb = (uInt*)THCITensor_data(state, rulesBuffer);
  cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
             cudaMemcpyHostToDevice);
  InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                        THCState_getCurrentStream(state)>>>(
      doF, diF, nRows, maxActive, nPlanes, rb, false);
}
#endif
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_INPUTLAYER_H
#define GPU_INPUTLAYER_H
// Forward pooling kernel. Grid-strides over output rows (one block per row)
// and block-strides over planes (one thread per plane). Output row `row`
// owns the rule slice rules[row * (1 + maxActive) ...]: slot 0 is the count
// of contributing input rows, slots 1..count their indices. With `average`
// set, contributions are scaled by 1/count. Output must be pre-zeroed.
template <typename T>
__global__ void InputLayer_fp(T *input_features, T *output_features,
                              uInt nRows, uInt maxActive, uInt nPlanes,
                              uInt *rules, bool average) {
  for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
    uInt *rule = rules + row * (1 + maxActive);
    uInt count = rule[0];
    T scale = (average and count > 0) ? 1.0f / count : 1.0f;
    T *dst = output_features + row * nPlanes;
    for (int k = 1; k <= count; k++) {
      T *src = input_features + rule[k] * nPlanes;
      for (uInt p = threadIdx.x; p < nPlanes; p += blockDim.x)
        dst[p] += scale * src[p];
    }
  }
}
// Backward scatter kernel — the transpose of InputLayer_fp. atomicAdd is
// required because distinct output rows may scatter into the same input row
// concurrently. d_input_features must be pre-zeroed.
template <typename T>
__global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
                              uInt nRows, uInt maxActive, uInt nPlanes,
                              uInt *rules, bool average) {
  for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
    uInt *rule = rules + row * (1 + maxActive);
    uInt count = rule[0];
    T scale = (average and count > 0) ? 1.0f / count : 1.0f;
    T *grad = d_output_features + row * nPlanes;
    for (int k = 1; k <= count; k++) {
      T *dst = d_input_features + rule[k] * nPlanes;
      for (uInt p = threadIdx.x; p < nPlanes; p += blockDim.x)
        atomicAdd(&dst[p], scale * grad[p]);
    }
  }
}
#endif /* GPU_INPUTLAYER_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef INPUTLAYER_H
#define INPUTLAYER_H
#include "../SparseConvNet.h"
#include <omp.h>
// mode 1==overwrite, 2=keep, 3=sum, 4=mean
// Build the input-layer rulebook: deduplicate raw coordinate rows into
// unique active sites, recording which input rows feed each output row.
// mode 1==overwrite (last duplicate wins), 2==keep (first duplicate wins),
// 3==sum, 4==mean.
// `coords` is an nInputRows x nInputColumns long array; when
// nInputColumns == dimension + 1 the extra trailing column is the sample
// index within the batch.
// Fixes vs. the previous version:
//  - mode 1 used .front() and mode 2 used .back(), the opposite of the
//    documented semantics above and of blRules in this same header;
//  - single hash lookup per row instead of find() followed by operator[];
//  - resize guard `idx + 1 >= SGs.size()` triggered a no-op resize when
//    idx was already a valid index; now grows only when needed.
template <uInt dimension>
void inputLayerRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
                     uInt nInputRows, uInt nInputColumns, uInt batchSize,
                     uInt mode, uInt &nActive) {
  assert(nActive == 0);
  assert(rules.size() == 0);
  assert(SGs.size() == 0);
  SGs.resize(batchSize); // Set a minimum batch size if necessary
  Point<dimension> p;
  // outputRows[o] lists the input rows that map to output row o.
  std::vector<std::vector<uInt>> outputRows;
  if (nInputColumns == dimension) {
    // No sample column: every row belongs to a single sparse grid.
    SGs.resize(1);
    auto &sg = SGs[0];
    for (uInt i = 0; i < nInputRows; ++i) {
      for (uInt j = 0; j < dimension; j++)
        p[j] = coords[j];
      coords += dimension;
      auto iter = sg.mp.find(p);
      uInt out;
      if (iter == sg.mp.end()) {
        out = nActive++;
        sg.mp[p] = out;
        outputRows.resize(nActive);
      } else {
        out = iter->second;
      }
      outputRows[out].push_back(i);
    }
  } else { // nInputColumns == dimension + 1
    for (uInt i = 0; i < nInputRows; ++i) {
      for (uInt j = 0; j < dimension; j++)
        p[j] = coords[j];
      uInt idx = coords[dimension];
      coords += dimension + 1;
      if (idx >= SGs.size())
        SGs.resize(idx + 1);
      auto &sg = SGs[idx];
      auto iter = sg.mp.find(p);
      uInt out;
      if (iter == sg.mp.end()) {
        out = nActive++;
        sg.mp[p] = out;
        outputRows.resize(nActive);
      } else {
        out = iter->second;
      }
      outputRows[out].push_back(i);
    }
  }
  // Header: [mode, maxActive, nInputRows, nOutputRows].
  rules.resize(2);
  rules[0].push_back(mode);
  rules[0].push_back(1); // replaced with maxActive if mode==3 or 4
  rules[0].push_back(nInputRows);
  rules[0].push_back(outputRows.size());
  auto &rule = rules[1];
  if (mode == 1) {
    // Overwrite: the most recent duplicate supplies the feature.
    for (uInt i = 0; i < nActive; ++i) {
      rule.push_back(1);
      rule.push_back(outputRows[i].back());
    }
  }
  if (mode == 2) {
    // Keep: the first duplicate supplies the feature.
    for (uInt i = 0; i < nActive; ++i) {
      rule.push_back(1);
      rule.push_back(outputRows[i].front());
    }
  }
  if (mode == 3 or mode == 4) {
    uInt maxActive = 0;
    for (auto &row : outputRows)
      maxActive = std::max(maxActive, (uInt)row.size());
    rules[0][1] = maxActive;
    for (auto &row : outputRows) {
      rule.push_back(row.size());
      for (auto &r : row)
        rule.push_back(r);
      // Pad this row's slice up to a fixed stride of maxActive + 1.
      rule.resize((rule.size() + maxActive) / (maxActive + 1) *
                  (maxActive + 1));
    }
  }
}
// bl is a batchSize x length x dimension long array of coordinates
// mode 0==guaranteed unique and all present; 1==overwrite, 2=keep, 3=sum,
// 4=mean
// Build the batch/length rulebook. `coords` is a batchSize x length x
// dimension long array of coordinates.
// mode 0==rows guaranteed unique and all present (fast path);
// 1==overwrite, 2=keep, 3=sum, 4=mean.
// Rows whose first coordinate is negative are skipped — presumably padding
// entries in a ragged batch (TODO confirm against callers).
// Header written to rules[0]: [mode, maxActive, batchSize, length, nActive].
// Fix: removed a leftover debug print of omp_get_num_threads() that ran on
// every mode-3/4 call.
template <uInt dimension>
void blRules(SparseGrids<dimension> &SGs, RuleBook &rules, long *coords,
             uInt batchSize, uInt length, uInt mode, uInt &nActive) {
  assert(nActive == 0);
  assert(rules.size() == 0);
  assert(SGs.size() == 0);
  SGs.resize(batchSize);
  uInt I; // declared here so it can appear in the OpenMP private() clauses
  if (mode == 0) {
    // Fast path: every row is a distinct active site, identity mapping.
    nActive = batchSize * length;
#pragma omp parallel for private(I)
    for (I = 0; I < batchSize; I++) {
      auto &sg = SGs[I];
      sg.ctr = I * length;
      auto c = coords + I * length * dimension;
      Point<dimension> p;
      for (int l = 0; l < length; ++l) {
        for (int j = 0; j < dimension; ++j)
          p[j] = c[j];
        c += dimension;
        sg.mp[p] = l;
      }
    }
    rules.resize(2);
    rules[0].push_back(0);
    rules[0].push_back(1);
    rules[0].push_back(batchSize);
    rules[0].push_back(length);
    rules[0].push_back(nActive);
    auto &rule = rules[1];
    int ll = 0;
    for (I = 0; I < batchSize; I++) {
      for (int l = 0; l < length; ++l, ++ll) {
        rule.push_back(1);
        rule.push_back(ll);
      }
    }
    return;
  }
  // Compile list of how input rows correspond to output rows, one sample at
  // a time (samples are independent, hence the parallel loop).
  std::vector<std::vector<std::vector<uInt>>> outputRows(batchSize);
  std::vector<uInt> nActives(batchSize);
#pragma omp parallel for private(I)
  for (I = 0; I < batchSize; I++) {
    auto &sg = SGs[I];
    auto &ors = outputRows[I];
    auto &nAct = nActives[I];
    auto c = coords + I * length * dimension;
    uInt i = I * length;
    Point<dimension> p;
    for (int l = 0; l < length; ++l, ++i) {
      for (int j = 0; j < dimension; ++j)
        p[j] = *c++;
      if (p[0] >= 0) { // negative first coordinate: row is skipped
        auto iter = sg.mp.find(p);
        if (iter == sg.mp.end()) {
          sg.mp[p] = nAct++;
          ors.resize(nAct);
        }
        ors[sg.mp[p]].push_back(i);
      }
    }
  }
  // Assign each sample a contiguous range of output rows.
  for (I = 0; I < batchSize; I++) {
    SGs[I].ctr = nActive;
    nActive += nActives[I];
  }
  uInt maxActive = 1;
  if (mode >= 3)
    for (auto &ors : outputRows)
      for (auto &row : ors)
        maxActive = std::max(maxActive, (uInt)row.size());
  rules.resize(2);
  rules[0].push_back(mode);
  rules[0].push_back(maxActive);
  rules[0].push_back(batchSize);
  rules[0].push_back(length);
  rules[0].push_back(nActive);
  auto &rule = rules[1];
  if (mode == 1) {
    // Overwrite: the most recent duplicate supplies the feature.
    rule.resize(2 * nActive);
#pragma omp parallel for private(I)
    for (I = 0; I < batchSize; I++) {
      auto &ors = outputRows[I];
      auto rr = &rule[SGs[I].ctr * 2];
      for (auto &row : ors) {
        rr[0] = row.size();
        rr[1] = row.back();
        rr += 2;
      }
    }
  }
  if (mode == 2) {
    // Keep: the first duplicate supplies the feature.
    rule.resize(2 * nActive);
#pragma omp parallel for private(I)
    for (I = 0; I < batchSize; I++) {
      auto &ors = outputRows[I];
      auto rr = &rule[SGs[I].ctr * 2];
      for (auto &row : ors) {
        rr[0] = row.size();
        rr[1] = row.front();
        rr += 2;
      }
    }
  }
  if (mode == 3 or mode == 4) {
    // Sum/mean: each output row's slice lists all contributing input rows,
    // padded to a fixed stride of maxActive + 1.
    rule.resize((maxActive + 1) * nActive);
#pragma omp parallel for private(I)
    for (I = 0; I < batchSize; I++) {
      auto &ors = outputRows[I];
      auto rr = &rule[SGs[I].ctr * (maxActive + 1)];
      for (auto &row : ors) {
        rr[0] = row.size();
        for (int i = 0; i < row.size(); ++i)
          rr[i + 1] = row[i];
        rr += 1 + maxActive;
      }
    }
  }
}
#endif /* INPUTLAYER_H */
...@@ -221,7 +221,7 @@ extern "C" void scn_D_(addSampleFromThresholdedTensor)( ...@@ -221,7 +221,7 @@ extern "C" void scn_D_(addSampleFromThresholdedTensor)(
THFloatTensor_resize2d(features_, nActive, nPlanes); THFloatTensor_resize2d(features_, nActive, nPlanes);
} }
// 3x3 valid convolutions, 3x3/2x2 pooling or strided convolutions // 3x3 submanifold convolutions, 3x3/2x2 pooling or strided convolutions
extern "C" void scn_D_(generateRuleBooks3s2)(void **m) { extern "C" void scn_D_(generateRuleBooks3s2)(void **m) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
long sz[Dimension], str[Dimension], inS[Dimension], outS[Dimension]; long sz[Dimension], str[Dimension], inS[Dimension], outS[Dimension];
...@@ -237,7 +237,7 @@ extern "C" void scn_D_(generateRuleBooks3s2)(void **m) { ...@@ -237,7 +237,7 @@ extern "C" void scn_D_(generateRuleBooks3s2)(void **m) {
auto &SGs = _m.grids[p1]; auto &SGs = _m.grids[p1];
auto &rb = _m.validRuleBooks[p2]; auto &rb = _m.validRuleBooks[p2];
if (rb.empty()) if (rb.empty())
ValidConvolution_SgsToRules(SGs, rb, sz); SubmanifoldConvolution_SgsToRules(SGs, rb, sz);
for (int i = 0; i < Dimension; ++i) for (int i = 0; i < Dimension; ++i)
if (p1[i] < 3 or p1[i] % 2 != 1) if (p1[i] < 3 or p1[i] % 2 != 1)
return; return;
...@@ -253,7 +253,7 @@ extern "C" void scn_D_(generateRuleBooks3s2)(void **m) { ...@@ -253,7 +253,7 @@ extern "C" void scn_D_(generateRuleBooks3s2)(void **m) {
} }
} }
// 3x3 valid convolutions, 2x2 pooling or strided convolutions // 3x3 submanifold convolutions, 2x2 pooling or strided convolutions
extern "C" void scn_D_(generateRuleBooks2s2)(void **m) { extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m) SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
long s2[Dimension], s3[Dimension], inS[Dimension], outS[Dimension]; long s2[Dimension], s3[Dimension], inS[Dimension], outS[Dimension];
...@@ -268,7 +268,7 @@ extern "C" void scn_D_(generateRuleBooks2s2)(void **m) { ...@@ -268,7 +268,7 @@ extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
while (true) { while (true) {
auto &SGs = _m.grids[p1]; auto &SGs = _m.grids[p1];
auto &rb = _m.validRuleBooks[p2]; auto &rb = _m.validRuleBooks[p2];
ValidConvolution_SgsToRules(SGs, rb, s3); SubmanifoldConvolution_SgsToRules(SGs, rb, s3);
for (int i = 0; i < Dimension; ++i) for (int i = 0; i < Dimension; ++i)
if (p1[i] < 2 or p1[i] % 2 != 0) if (p1[i] < 2 or p1[i] % 2 != 0)
return; return;
......
...@@ -10,20 +10,26 @@ ...@@ -10,20 +10,26 @@
#include "../SparseConvNet.h" #include "../SparseConvNet.h"
#include "ActivePoolingRules.h" #include "ActivePoolingRules.h"
#include "ConvolutionRules.h" #include "ConvolutionRules.h"
#include "ValidConvolutionRules.h" #include "InputLayerRules.h"
#include "SubmanifoldConvolutionRules.h"
#include <tuple> #include <tuple>
#include <unordered_map> #include <unordered_map>
template <uInt dimension> class Metadata { template <uInt dimension> class Metadata {
public: public:
//Count of active sites for each scale
std::unordered_map<Point<dimension>, uInt, IntArrayHash<dimension>> nActive; std::unordered_map<Point<dimension>, uInt, IntArrayHash<dimension>> nActive;
//Hash tables for each scale locating the active points
std::unordered_map<Point<dimension>, SparseGrids<dimension>, std::unordered_map<Point<dimension>, SparseGrids<dimension>,
IntArrayHash<dimension>> grids; IntArrayHash<dimension>> grids;
std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>> std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
activePoolingRuleBooks; activePoolingRuleBooks;
RuleBook inputLayerRuleBook;
RuleBook blLayerRuleBook;
std::unordered_map<Point<2 * dimension>, RuleBook, std::unordered_map<Point<2 * dimension>, RuleBook,
IntArrayHash<2 * dimension>> validRuleBooks; IntArrayHash<2 * dimension>> validRuleBooks;
...@@ -49,6 +55,8 @@ public: ...@@ -49,6 +55,8 @@ public:
inputSGs = nullptr; inputSGs = nullptr;
inputSG = nullptr; inputSG = nullptr;
inputNActive = nullptr; inputNActive = nullptr;
inputLayerRuleBook.clear();
blLayerRuleBook.clear();
} }
void setInputSpatialSize(THLongTensor *spatialSize) { void setInputSpatialSize(THLongTensor *spatialSize) {
...@@ -56,23 +64,43 @@ public: ...@@ -56,23 +64,43 @@ public:
inputSGs = &grids[inputSpatialSize]; inputSGs = &grids[inputSpatialSize];
inputNActive = &nActive[inputSpatialSize]; inputNActive = &nActive[inputSpatialSize];
} }
void inputLayer(THLongTensor *spatialSize, THLongTensor *coords,
uInt batchSize, uInt mode) {
assert(spatialSize->nDimension == 1);
assert(spatialSize->size[0] == dimension);
assert(coords->nDimension == 2);
assert(coords->size[1] >= dimension and coords->size[1] <= dimension + 1);
setInputSpatialSize(spatialSize);
inputLayerRules<dimension>(*inputSGs, inputLayerRuleBook,
THLongTensor_data(coords), coords->size[0],
coords->size[1], batchSize, mode, *inputNActive);
}
void blLayer(THLongTensor *spatialSize, THLongTensor *coords, uInt mode) {
assert(spatialSize->nDimension == 1);
assert(spatialSize->size[0] == dimension);
assert(coords->nDimension == 3);
assert(coords->size[2] == dimension);
setInputSpatialSize(spatialSize);
blRules<dimension>(*inputSGs, blLayerRuleBook, THLongTensor_data(coords),
coords->size[0], coords->size[1], mode, *inputNActive);
}
SparseGrids<dimension> &getSparseGrid(THLongTensor *spatialSize) { SparseGrids<dimension> &getSparseGrid(THLongTensor *spatialSize) {
return grids[LongTensorToPoint<dimension>(spatialSize)]; return grids[LongTensorToPoint<dimension>(spatialSize)];
}; };
uInt getNActive(THLongTensor *spatialSize) { uInt getNActive(THLongTensor *spatialSize) {
return nActive[LongTensorToPoint<dimension>(spatialSize)]; return nActive[LongTensorToPoint<dimension>(spatialSize)];
}; };
RuleBook &getValidRuleBook(THLongTensor *spatialSize, THLongTensor *size, RuleBook &getSubmanifoldRuleBook(THLongTensor *spatialSize, THLongTensor *size,
bool openMP) { bool openMP) {
auto p = TwoLongTensorsToPoint<dimension>(spatialSize, size); auto p = TwoLongTensorsToPoint<dimension>(spatialSize, size);
auto &rb = validRuleBooks[p]; auto &rb = validRuleBooks[p];
if (rb.empty()) { if (rb.empty()) {
auto &SGs = grids[LongTensorToPoint<dimension>(spatialSize)]; auto &SGs = grids[LongTensorToPoint<dimension>(spatialSize)];
#if defined(ENABLE_OPENMP) #if defined(ENABLE_OPENMP)
openMP ? ValidConvolution_SgsToRules_OMP(SGs, rb, THLongTensor_data(size)) openMP ? SubmanifoldConvolution_SgsToRules_OMP(SGs, rb, THLongTensor_data(size))
: :
#endif #endif
ValidConvolution_SgsToRules(SGs, rb, THLongTensor_data(size)); SubmanifoldConvolution_SgsToRules(SGs, rb, THLongTensor_data(size));
} }
return rb; return rb;
} }
......
...@@ -24,7 +24,7 @@ InputRegionCalculator_Valid(const Point<dimension> &output, long *size) { ...@@ -24,7 +24,7 @@ InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
// rules is used to carry out the "lowering" whilst carrying out the convolution // rules is used to carry out the "lowering" whilst carrying out the convolution
template <uInt dimension> template <uInt dimension>
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules, double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
long *size) { long *size) {
uInt sd = volume<dimension>(size); uInt sd = volume<dimension>(size);
double countActiveInputs = 0; double countActiveInputs = 0;
...@@ -46,7 +46,7 @@ double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules, ...@@ -46,7 +46,7 @@ double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
} }
template <uInt dimension> template <uInt dimension>
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules, uInt SubmanifoldConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
long *size) { long *size) {
uInt sd = volume<dimension>(size); uInt sd = volume<dimension>(size);
uInt countActiveInputs = 0; uInt countActiveInputs = 0;
...@@ -54,11 +54,11 @@ uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules, ...@@ -54,11 +54,11 @@ uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
rules.resize(sd); rules.resize(sd);
for (uInt i = 0; i < SGs.size(); i++) for (uInt i = 0; i < SGs.size(); i++)
countActiveInputs += countActiveInputs +=
ValidConvolution_SgToRules<dimension>(SGs[i], rules, size); SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rules, size);
return countActiveInputs; return countActiveInputs;
} }
template <uInt dimension> template <uInt dimension>
uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs, uInt SubmanifoldConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
RuleBook &rules, long *size) { RuleBook &rules, long *size) {
std::vector<RuleBook> rbs(SGs.size()); std::vector<RuleBook> rbs(SGs.size());
std::vector<double> countActiveInputs(SGs.size()); std::vector<double> countActiveInputs(SGs.size());
...@@ -71,7 +71,7 @@ uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs, ...@@ -71,7 +71,7 @@ uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
for (i = 0; i < SGs.size(); i++) { for (i = 0; i < SGs.size(); i++) {
rbs[i].resize(sd); rbs[i].resize(sd);
countActiveInputs[i] = countActiveInputs[i] =
ValidConvolution_SgToRules<dimension>(SGs[i], rbs[i], size); SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
} }
} }
{ {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#define ENABLE_OPENMP YES
#if defined(ENABLE_OPENMP) #if defined(ENABLE_OPENMP)
#include <omp.h> #include <omp.h>
#endif #endif
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -36,6 +36,9 @@ ...@@ -36,6 +36,9 @@
#include "generic/CPU/Deconvolution.cpp" #include "generic/CPU/Deconvolution.cpp"
#include "generic/CPU/THGenerateDimFloatTypes.h" #include "generic/CPU/THGenerateDimFloatTypes.h"
#include "generic/CPU/InputLayer.cpp"
#include "generic/CPU/THGenerateDimFloatTypes.h"
#include "generic/CPU/LeakyReLU.cpp" #include "generic/CPU/LeakyReLU.cpp"
#include "generic/CPU/THGenerateFloatTypes.h" #include "generic/CPU/THGenerateFloatTypes.h"
...@@ -50,7 +53,7 @@ ...@@ -50,7 +53,7 @@
extern "C" long scn_readPtr(void **ptr) { return (long)(ptr[0]); } extern "C" long scn_readPtr(void **ptr) { return (long)(ptr[0]); }
extern "C" void scn_writePtr(long p, void **ptr) { ptr[0] = (void *)p; } extern "C" void scn_writePtr(long p, void **ptr) { ptr[0] = (void *)p; }
extern "C" double scn_ruleBookBits() { return 8 * sizeof(uInt); } extern "C" double scn_ruleBookBits(void) { return 8 * sizeof(uInt); }
#undef scn_D_ #undef scn_D_
#undef scn_DR_ #undef scn_DR_
......
...@@ -37,6 +37,9 @@ extern THCState *state; ...@@ -37,6 +37,9 @@ extern THCState *state;
#include "generic/GPU/Deconvolution.cu" #include "generic/GPU/Deconvolution.cu"
#include "generic/GPU/THGenerateDimCudaFloatTypes.h" #include "generic/GPU/THGenerateDimCudaFloatTypes.h"
#include "generic/GPU/InputLayer.cu"
#include "generic/GPU/THGenerateDimCudaFloatTypes.h"
#include "generic/GPU/LeakyReLU.cu" #include "generic/GPU/LeakyReLU.cu"
#include "generic/GPU/THGenerateCudaFloatTypes.h" #include "generic/GPU/THGenerateCudaFloatTypes.h"
......
# Copyright 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# Generator script: writes the C FFI declaration files
# (header_cpu.c/.h and header_gpu.c/.h) listing every scn_* entry point.
# Width of the integer type used for GPU rule buffers; selects
# THCudaIntTensor (32) vs THCudaLongTensor (64) in typed_fn below.
n_bits = 32
# Each target is a [stub-source file, header file] pair; the handles stay
# open for the whole script and are written to by fn() below.
f_cpu = [open('header_cpu.c', 'w'), open('header_cpu.h', 'w')]
f_gpu = [open('header_gpu.c', 'w'), open('header_gpu.h', 'w')]
# Emit the license banner at the top of all four generated files.
for f in f_cpu + f_gpu:
    f.write("""// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
""")
def fn(st, f=f_cpu):
    """Emit one declaration *st*: an empty-bodied stub to the .c file and a
    prototype (terminated with ';') to the .h file of the target pair *f*."""
    stub_file, header_file = f
    stub_file.write(st + '{}')
    header_file.write(st + ';')
def dim_fn(st, f=f_cpu):
    """Instantiate *st* once for each supported dimension (1..10), replacing
    the DIMENSION placeholder, and emit each instantiation via fn()."""
    for dim in range(1, 11):
        fn(st.replace('DIMENSION', str(dim)), f)
def typed_fn(st):
    """Emit *st* once per (arch, scalar) build variant.

    Variants, in emission order: CPU float, CPU double, GPU float.  The
    placeholder substitutions are applied in the same order as the original
    hand-unrolled version: ARCH, THITensor, REAL, THTensor.
    """
    variants = [
        ('cpu', 'void', 'float', 'THFloatTensor', f_cpu),
        ('cpu', 'void', 'double', 'THDoubleTensor', f_cpu),
        # GPU integer-tensor type tracks the configured rulebook bit width.
        ('gpu', 'THCudaIntTensor' if n_bits == 32 else 'THCudaLongTensor',
         'float', 'THCudaTensor', f_gpu),
    ]
    for arch, itensor, real, tensor, target in variants:
        decl = st.replace('ARCH', arch)
        decl = decl.replace('THITensor', itensor)
        decl = decl.replace('REAL', real)
        decl = decl.replace('THTensor', tensor)
        fn(decl, target)
def dim_typed_fn(st):
    """Emit *st* for every dimension (1..10) and every typed variant.

    'REAL_' is collapsed to 'REAL' first so that typed_fn()'s REAL
    substitution also covers the trailing-underscore spelling.
    """
    base = st.replace('REAL_', 'REAL')
    for dim in range(1, 11):
        typed_fn(base.replace('DIMENSION', str(dim)))
fn("""
long scn_readPtr(void **ptr)""")
fn("""
void scn_writePtr(long p, void **ptr)""")
fn("""
double scn_ruleBookBits(void)""")
fn("""
void scn_2_drawCurve(void **m, THFloatTensor *features, THFloatTensor *stroke)""")
dim_fn("""
double scn_DIMENSION_addSampleFromThresholdedTensor(
void **m, THFloatTensor *features_, THFloatTensor *tensor_,
THLongTensor *offset_, THLongTensor *spatialSize_, float threshold)""")
dim_fn("""
void scn_DIMENSION_batchAddSample(void **m)""")
dim_fn("""
void scn_DIMENSION_createMetadataForDenseToSparse(
void **m, THLongTensor *spatialSize_, THLongTensor *pad, THLongTensor *nz,
long batchSize)""")
dim_fn("""
void scn_DIMENSION_freeMetadata(void **metadata)""")
dim_fn("""
void scn_DIMENSION_generateRuleBooks3s2(void **m)""")
dim_fn("""
void scn_DIMENSION_generateRuleBooks2s2(void **m)""")
dim_fn("""
void scn_DIMENSION_setInputSpatialSize(void **m, THLongTensor *spatialSize)""")
dim_fn("""
void scn_DIMENSION_setInputSpatialLocation(void **m, THFloatTensor *features,
THLongTensor *location, THFloatTensor *vec, _Bool overwrite)""")
dim_fn("""
void scn_DIMENSION_setInputSpatialLocations(void **m, THFloatTensor *features,
THLongTensor *locations, THFloatTensor *vecs, _Bool overwrite)""")
dim_fn("""
void scn_DIMENSION_getSpatialLocations(void **m, THLongTensor *spatialSize,
THLongTensor *locations)""")
typed_fn("""
void scn_ARCH_REAL_AffineReluTrivialConvolution_updateOutput(
THTensor *input_features, THTensor *output_features,
THTensor *affineWeight, THTensor *affineBias, THTensor *convWeight)""")
typed_fn("""
void scn_ARCH_REAL_AffineReluTrivialConvolution_backward(
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *affineWeight,
THTensor *d_affineWeight, THTensor *affineBias, THTensor *d_affineBias,
THTensor *convWeight, THTensor *d_convWeight, _Bool additiveGrad)""")
typed_fn("""
void scn_ARCH_REAL_BatchwiseMultiplicativeDropout_updateOutput(
THTensor *input_features, THTensor *output_features,
THTensor *noise, long nPlanes, long input_stride, long output_stride,
float alpha)""")
typed_fn("""
void scn_ARCH_REAL_BatchwiseMultiplicativeDropout_updateGradInput(
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *noise, long nPlanes,
long input_stride, long output_stride, float alpha)""")
typed_fn("""
void scn_ARCH_REAL_BatchNormalization_updateOutput(
THTensor *input_features, THTensor *output_features,
THTensor *saveMean, THTensor *saveInvStd, THTensor *runningMean,
THTensor *runningVar, THTensor *weight, THTensor *bias, REAL eps,
REAL momentum, _Bool train, REAL leakiness)""")
typed_fn("""
void scn_ARCH_REAL_BatchNormalization_backward(
THTensor *input_features, THTensor *d_input_features,
THTensor *output_features, THTensor *d_output_features, THTensor *saveMean,
THTensor *saveInvStd, THTensor *runningMean, THTensor *runningVar,
THTensor *weight, THTensor *bias, THTensor *d_weight, THTensor *d_bias,
REAL leakiness)""")
typed_fn("""
void scn_ARCH_REAL_BatchNormalizationInTensor_updateOutput(
THTensor *input_features, THTensor *output_features,
THTensor *saveMean, THTensor *saveInvStd, THTensor *runningMean,
THTensor *runningVar, THTensor *weight, THTensor *bias, REAL eps,
REAL momentum, _Bool train, REAL leakiness)""")
typed_fn("""
void scn_ARCH_REAL_LeakyReLU_updateOutput(
THTensor *input_features, THTensor *output_features,
float alpha)""")
typed_fn("""
void scn_ARCH_REAL_LeakyReLU_updateGradInput(
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, float alpha)""")
typed_fn("""
double scn_ARCH_REAL_NetworkInNetwork_updateOutput(
THTensor *input_features, THTensor *output_features,
THTensor *weight, THTensor *bias)""")
typed_fn("""
void scn_ARCH_REAL_NetworkInNetwork_updateGradInput(
THTensor *d_input_features, THTensor *d_output_features,
THTensor *weight)""")
typed_fn("""
void scn_ARCH_REAL_NetworkInNetwork_accGradParameters(
THTensor *input_features, THTensor *d_output_features,
THTensor *d_weight, THTensor *d_bias)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONActivePooling_updateOutput(
THLongTensor *inputSize, void **m, THFloatTensor *input_features,
THFloatTensor *output_features, void *rulesBuffer, _Bool average);""")
dim_typed_fn("""void scn_ARCH_REAL_DIMENSIONActivePooling_updateGradInput(
THLongTensor *inputSize, void **m,
THFloatTensor *d_input_features, THFloatTensor *d_output_features,
void *rulesBuffer, _Bool average);""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONAveragePooling_updateOutput(
THLongTensor *inputSize, THLongTensor *outputSize,
THLongTensor *poolSize, THLongTensor *poolStride, void **m,
THTensor *input_features, THTensor *output_features, long nFeaturesToDrop,
THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONAveragePooling_updateGradInput(
THLongTensor * inputSize, THLongTensor * outputSize,
THLongTensor * poolSize, THLongTensor * poolStride, void **m,
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, long nFeaturesToDrop,
THITensor *rulesBuffer)""")
dim_typed_fn("""
double scn_ARCH_REAL_DIMENSIONConvolution_updateOutput(
THLongTensor *inputSize, THLongTensor *outputSize,
THLongTensor *filterSize, THLongTensor *filterStride, void **m,
THTensor *input_features, THTensor *output_features, THTensor *weight,
THTensor *bias, long filterVolume, THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONConvolution_backward(
THLongTensor *inputSize, THLongTensor *outputSize,
THLongTensor *filterSize, THLongTensor *filterStride, void **m,
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *weight, THTensor *d_weight,
THTensor *d_bias, long filterVolume, THITensor *rulesBuffer)""")
dim_typed_fn("""
double scn_ARCH_REAL_DIMENSIONDeconvolution_updateOutput(
THLongTensor *inputSize, THLongTensor *outputSize,
THLongTensor *filterSize, THLongTensor *filterStride, void **m,
THTensor *input_features, THTensor *output_features, THTensor *weight,
THTensor *bias, long filterVolume, THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONDeconvolution_backward(
THLongTensor *inputSize, THLongTensor *outputSize,
THLongTensor *filterSize, THLongTensor *filterStride, void **m,
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *weight, THTensor *d_weight,
THTensor *d_bias, long filterVolume, THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONMaxPooling_updateOutput(
THLongTensor *inputSize, THLongTensor *outputSize,
THLongTensor *poolSize, THLongTensor *poolStride, void **m,
THTensor *input_features, THTensor *output_features, long nFeaturesToDrop,
THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONMaxPooling_updateGradInput(
THLongTensor * inputSize, THLongTensor * outputSize,
THLongTensor * poolSize, THLongTensor * poolStride, void **m,
THTensor *input_features, THTensor *d_input_features,
THTensor *output_features, THTensor *d_output_features,
long nFeaturesToDrop, THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONSparseToDense_updateOutput(
THLongTensor *inputSize, void **m, THTensor *input_features,
THTensor *output_features, THITensor *rulesBuffer, long nPlanes)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONSparseToDense_updateGradInput(
THLongTensor *inputSize, void **m, THTensor *input_features,
THTensor *d_input_features, THTensor *d_output_features,
THITensor *rulesBuffer)""")
dim_typed_fn("""
double scn_ARCH_REAL_DIMENSIONSubmanifoldConvolution_updateOutput(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THTensor *input_features, THTensor *output_features, THTensor *weight,
THTensor *bias, long filterVolume, THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONSubmanifoldConvolution_backward(
THLongTensor *inputSize, THLongTensor *filterSize, void **m,
THTensor *input_features, THTensor *d_input_features,
THTensor *d_output_features, THTensor *weight, THTensor *d_weight,
THTensor *d_bias, long filterVolume, THITensor *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONInputLayer_updateOutput(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THFloatTensor *input_features, THFloatTensor *output_features, long batchSize,
long mode, void *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONInputLayer_updateGradInput(
void **m, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
void *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONBLInputLayer_updateOutput(
void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
THFloatTensor *input_features, THFloatTensor *output_features, long mode,
void *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONBLInputLayer_updateGradInput(
void **m, THFloatTensor *d_input_features,THFloatTensor *d_output_features,
void *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONBLOutputLayer_updateOutput(
void **m, THFloatTensor *input_features, THFloatTensor *output_features,
void *rulesBuffer)""")
dim_typed_fn("""
void scn_ARCH_REAL_DIMENSIONBLOutputLayer_updateGradInput(
void **m, THFloatTensor *d_input_features, THFloatTensor *d_output_features,
void *rulesBuffer)""")
...@@ -16,6 +16,7 @@ from .denseToSparse import DenseToSparse ...@@ -16,6 +16,7 @@ from .denseToSparse import DenseToSparse
from .dropout import Dropout, BatchwiseDropout from .dropout import Dropout, BatchwiseDropout
from .identity import Identity from .identity import Identity
from .inputBatch import InputBatch from .inputBatch import InputBatch
from .inputLayer import InputLayer, BLInputLayer, BLOutputLayer
from .maxPooling import MaxPooling from .maxPooling import MaxPooling
from .metadata import Metadata from .metadata import Metadata
from .networkArchitectures import * from .networkArchitectures import *
...@@ -26,16 +27,18 @@ from .sparseToDense import SparseToDense ...@@ -26,16 +27,18 @@ from .sparseToDense import SparseToDense
from .submanifoldConvolution import SubmanifoldConvolution, ValidConvolution from .submanifoldConvolution import SubmanifoldConvolution, ValidConvolution
from .tables import * from .tables import *
def concatenate_feature_planes(input): def concatenate_feature_planes(input):
output = SparseConvNetTensor() output = SparseConvNetTensor()
output.metadata = input[0].metadata output.metadata = input[0].metadata
output.spatial_size = input[0].metadata output.spatial_size = input[0].metadata
output.features=torch.cat([i.features for i in input],1) output.features = torch.cat([i.features for i in input], 1)
return output return output
def add_feature_planes(input): def add_feature_planes(input):
output = SparseConvNetTensor() output = SparseConvNetTensor()
output.metadata = input[0].metadata output.metadata = input[0].metadata
output.spatial_size = input[0].metadata output.spatial_size = input[0].metadata
output.features=sum([i.features for i in input]) output.features = sum([i.features for i in input])
return output return output
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment