Commit 2c4ed608 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

Goodbye THNN. Hello ATen!

parent 6d4475db
# Copyright 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# Generate the pybind11 binding sources for the CPU-only and the CUDA
# builds.  Both files receive the same cpu_* declarations/bindings; the
# CUDA file additionally receives cuda_* variants (see the writes below).
f_cpu = open('pybind_cpu.cpp', 'w')
f_cuda = open('pybind_cuda.cpp', 'w')
# Shared file header: license banner plus the ATen/pybind umbrella header
# and the Metadata class used by every dimension-dependent declaration.
txt="""
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include <torch/torch.h>
#include "Metadata/Metadata.h"
"""
f_cpu.write(txt)
f_cuda.write(txt)
txt="""
template <typename T>
double cpu_AffineReluTrivialConvolution_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor affineWeight,
at::Tensor affineBias,
at::Tensor convWeight);
template <typename T>
void cpu_AffineReluTrivialConvolution_backward(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor affineWeight,
at::Tensor d_affineWeight, at::Tensor affineBias, at::Tensor d_affineBias,
at::Tensor convWeight, at::Tensor d_convWeight, bool additiveGrad);
template <typename T>
void cpu_BatchNormalization_updateOutput(
at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void cpu_BatchNormalizationInTensor_updateOutput(
at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void cpu_BatchNormalization_backward(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor output_features, at::Tensor d_output_features,
at::Tensor saveMean, at::Tensor saveInvStd, at::Tensor runningMean,
at::Tensor runningVar, at::Tensor weight, at::Tensor bias,
at::Tensor d_weight, at::Tensor d_bias, T leakiness);
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor noise,
float alpha);
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor noise, float alpha);
template <typename T>
void cpu_LeakyReLU_updateOutput(at::Tensor input_features,
at::Tensor output_features, float alpha);
template <typename T>
void cpu_LeakyReLU_updateGradInput(at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features, float alpha);
template <typename T>
double cpu_NetworkInNetwork_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor weight, at::Tensor bias);
template <typename T>
void cpu_NetworkInNetwork_updateGradInput(at::Tensor d_input_features,
at::Tensor d_output_features,
at::Tensor weight);
template <typename T>
void cpu_NetworkInNetwork_accGradParameters(at::Tensor input_features,
at::Tensor d_output_features,
at::Tensor d_weight,
at::Tensor d_bias);
template <typename T, Int Dimension>
void cpu_ActivePooling_updateOutput(at::Tensor inputSize,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features, bool average);
template <typename T, Int Dimension>
void cpu_ActivePooling_updateGradInput(
at::Tensor inputSize, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features, bool average);
template <typename T, Int Dimension>
void cpu_AveragePooling_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_AveragePooling_updateGradInput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features,
long nFeaturesToDrop);
template <typename T, Int Dimension>
double cpu_Convolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_Convolution_backward(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor filterSize, at::Tensor filterStride,
Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor weight,
at::Tensor d_weight, at::Tensor d_bias);
template <typename T, Int Dimension>
double cpu_SubmanifoldConvolution_updateOutput(
at::Tensor inputSize, at::Tensor filterSize, Metadata<Dimension> &m,
at::Tensor input_features, at::Tensor output_features, at::Tensor weight,
at::Tensor bias);
template <typename T, Int Dimension>
void cpu_SubmanifoldConvolution_backward(
at::Tensor inputSize, at::Tensor filterSize, Metadata<Dimension> &m,
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor weight, at::Tensor d_weight,
at::Tensor d_bias);
template <typename T, Int Dimension>
double cpu_FullConvolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_FullConvolution_backward(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features,
at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);
template <typename T, Int Dimension>
double cpu_RandomizedStrideConvolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_RandomizedStrideConvolution_backward(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features,
at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);
template <typename T, Int Dimension>
double cpu_Deconvolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_Deconvolution_backward(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor filterSize, at::Tensor filterStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor weight,
at::Tensor d_weight, at::Tensor d_bias);
template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m, at::Tensor spatialSize,
at::Tensor input_coords,
at::Tensor input_features,
at::Tensor output_features, long batchSize,
long mode);
template <typename T, Int Dimension>
void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features);
template <typename T, Int Dimension>
void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
at::Tensor spatialSize,
at::Tensor input_coords,
at::Tensor input_features,
at::Tensor output_features, long mode);
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features);
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor poolSize, at::Tensor poolStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features,
long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_MaxPooling_updateGradInput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor output_features,
at::Tensor d_output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateGradInput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor output_features,
at::Tensor d_output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(at::Tensor inputSize,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features, long nPlanes);
template <typename T, Int Dimension>
void cpu_SparseToDense_updateGradInput(at::Tensor inputSize,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor poolSize, at::Tensor poolStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features,
long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_UnPooling_updateGradInput(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor poolSize, at::Tensor poolStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features,
long nFeaturesToDrop);
"""
f_cpu.write(txt)
f_cuda.write(txt)
f_cuda.write(txt.replace('cpu','cuda'))
# A hand-drawn-curve helper that is no longer bound; kept for reference.
# txt="""
# void cpu_float_DrawCurve_2(Metadata<2> &m, at::Tensor features,
# at::Tensor stroke);
# """
# f_cpu.write(txt)
# f_cuda.write(txt)
# Open the pybind11 module definition in both generated sources.
module_open = """
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"""
for out in (f_cpu, f_cuda):
    out.write(module_open)
# Bind Metadata<1>..Metadata<4> as Python classes Metadata_1..Metadata_4
# in both generated sources.  The literal token DIMENSION is stamped with
# the concrete dimension (it also rewrites the Python-visible class name
# "Metadata_DIMENSION").
metadata_binding = """
pybind11::class_<Metadata<DIMENSION>>(m, "Metadata_DIMENSION")
.def(pybind11::init<>())
.def("clear", &Metadata<DIMENSION>::clear)
.def("setInputSpatialSize", &Metadata<DIMENSION>::setInputSpatialSize)
.def("batchAddSample", &Metadata<DIMENSION>::batchAddSample)
.def("setInputSpatialLocation", &Metadata<DIMENSION>::setInputSpatialLocation)
.def("setInputSpatialLocations", &Metadata<DIMENSION>::setInputSpatialLocations)
.def("getSpatialLocations", &Metadata<DIMENSION>::getSpatialLocations)
.def("createMetadataForDenseToSparse", &Metadata<DIMENSION>::createMetadataForDenseToSparse)
.def("sparsifyMetadata", &Metadata<DIMENSION>::sparsifyMetadata)
.def("addSampleFromThresholdedTensor", &Metadata<DIMENSION>::addSampleFromThresholdedTensor)
.def("generateRuleBooks3s2", &Metadata<DIMENSION>::generateRuleBooks3s2)
.def("generateRuleBooks2s2", &Metadata<DIMENSION>::generateRuleBooks2s2);
"""
for out in (f_cpu, f_cuda):
    for dim in range(1, 5):
        out.write(metadata_binding.replace('DIMENSION', str(dim)))
def typed_fn(st):
    # Emit m.def(...) registrations for a kernel templated on <typename T>:
    # float and double instantiations of the cpu_* kernel go into both
    # files; a float instantiation of the cuda_* kernel goes into the CUDA
    # file only.
    line = 'm.def("ARCH_REAL_' + st + '", &ARCH_' + st + '<REAL>, "");\n'
    for out in (f_cpu, f_cuda):
        out.write(line.replace('ARCH', 'cpu').replace('REAL', 'float'))
        out.write(line.replace('ARCH', 'cpu').replace('REAL', 'double'))
    # NOTE(review): assumed to run once per call (outside the loop above);
    # inside the loop it would register the cuda kernel twice.
    f_cuda.write(line.replace('ARCH', 'cuda').replace('REAL', 'float'))
def dim_typed_fn(st):
    # Emit m.def(...) registrations for a kernel templated on
    # <typename T, Int Dimension> for dimensions 1..4: float and double
    # cpu_* instantiations into both files, float cuda_* instantiations
    # into the CUDA file only.
    line = 'm.def("ARCH_REAL_' + st + '_DIMENSION", &ARCH_' + st + '<REAL,DIMENSION>, "");\n'
    for dim in range(1, 5):
        stamped = line.replace('DIMENSION', str(dim))
        for out in (f_cpu, f_cuda):
            out.write(stamped.replace('ARCH', 'cpu').replace('REAL', 'float'))
            out.write(stamped.replace('ARCH', 'cpu').replace('REAL', 'double'))
        # NOTE(review): assumed to run once per dimension (outside the file
        # loop); inside it the cuda kernel would be registered twice.
        f_cuda.write(stamped.replace('ARCH', 'cuda').replace('REAL', 'float'))
typed_fn("AffineReluTrivialConvolution_updateOutput")
typed_fn("AffineReluTrivialConvolution_backward")
typed_fn("BatchwiseMultiplicativeDropout_updateOutput")
typed_fn("BatchwiseMultiplicativeDropout_updateGradInput")
typed_fn("BatchNormalization_updateOutput")
typed_fn("BatchNormalization_backward")
typed_fn("LeakyReLU_updateOutput")
typed_fn("LeakyReLU_updateGradInput")
typed_fn("NetworkInNetwork_updateOutput")
typed_fn("NetworkInNetwork_updateGradInput")
typed_fn("NetworkInNetwork_accGradParameters")
dim_typed_fn("ActivePooling_updateOutput")
dim_typed_fn("ActivePooling_updateGradInput")
dim_typed_fn("AveragePooling_updateOutput")
dim_typed_fn("AveragePooling_updateGradInput")
dim_typed_fn("Convolution_updateOutput")
dim_typed_fn("Convolution_backward")
dim_typed_fn("RandomizedStrideConvolution_updateOutput")
dim_typed_fn("RandomizedStrideConvolution_backward")
dim_typed_fn("Deconvolution_updateOutput")
dim_typed_fn("Deconvolution_backward")
dim_typed_fn("FullConvolution_updateOutput")
dim_typed_fn("FullConvolution_backward")
dim_typed_fn("MaxPooling_updateOutput")
dim_typed_fn("MaxPooling_updateGradInput")
dim_typed_fn("RandomizedStrideMaxPooling_updateOutput")
dim_typed_fn("RandomizedStrideMaxPooling_updateGradInput")
dim_typed_fn("SparseToDense_updateOutput")
dim_typed_fn("SparseToDense_updateGradInput")
dim_typed_fn("SubmanifoldConvolution_updateOutput")
dim_typed_fn("SubmanifoldConvolution_backward")
dim_typed_fn("InputLayer_updateOutput")
dim_typed_fn("InputLayer_updateGradInput")
dim_typed_fn("OutputLayer_updateOutput")
dim_typed_fn("OutputLayer_updateGradInput")
dim_typed_fn("BLInputLayer_updateOutput")
dim_typed_fn("BLInputLayer_updateGradInput")
dim_typed_fn("BLOutputLayer_updateOutput")
dim_typed_fn("BLOutputLayer_updateGradInput")
dim_typed_fn("UnPooling_updateOutput")
dim_typed_fn("UnPooling_updateGradInput")
# Close the pybind11 module in both generated sources, exposing the
# rulebook index width so Python can detect the Int typedef size, then
# flush both files to disk.
module_close = """
m.def("n_rulebook_bits", []() {return 8*sizeof(Int);}, "");
}
"""
for out in (f_cpu, f_cuda):
    out.write(module_close)
f_cpu.close()
f_cuda.close()
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include <torch/torch.h>
#include "Metadata/Metadata.h"
// NOTE(review): this file appears to be produced by the pybind generation
// script; prefer editing the generator over editing this file directly.
//
// Forward declarations of the templated CPU kernels registered by the
// PYBIND11_MODULE block below.  T is the value type (instantiated for
// float and double); Int and Metadata<Dimension> come from
// "Metadata/Metadata.h".  Definitions live in the kernel sources.
//
// Dimension-independent kernels: templated on the value type T only.
template <typename T>
double cpu_AffineReluTrivialConvolution_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor affineWeight,
at::Tensor affineBias,
at::Tensor convWeight);
template <typename T>
void cpu_AffineReluTrivialConvolution_backward(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor affineWeight,
at::Tensor d_affineWeight, at::Tensor affineBias, at::Tensor d_affineBias,
at::Tensor convWeight, at::Tensor d_convWeight, bool additiveGrad);
template <typename T>
void cpu_BatchNormalization_updateOutput(
at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void cpu_BatchNormalizationInTensor_updateOutput(
at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void cpu_BatchNormalization_backward(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor output_features, at::Tensor d_output_features,
at::Tensor saveMean, at::Tensor saveInvStd, at::Tensor runningMean,
at::Tensor runningVar, at::Tensor weight, at::Tensor bias,
at::Tensor d_weight, at::Tensor d_bias, T leakiness);
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor noise,
float alpha);
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor noise, float alpha);
template <typename T>
void cpu_LeakyReLU_updateOutput(at::Tensor input_features,
at::Tensor output_features, float alpha);
template <typename T>
void cpu_LeakyReLU_updateGradInput(at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features, float alpha);
template <typename T>
double cpu_NetworkInNetwork_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor weight, at::Tensor bias);
template <typename T>
void cpu_NetworkInNetwork_updateGradInput(at::Tensor d_input_features,
at::Tensor d_output_features,
at::Tensor weight);
template <typename T>
void cpu_NetworkInNetwork_accGradParameters(at::Tensor input_features,
at::Tensor d_output_features,
at::Tensor d_weight,
at::Tensor d_bias);
// Dimension-dependent kernels: templated on <T, Int Dimension> and
// operating through a Metadata<Dimension> coordinate/rulebook manager.
template <typename T, Int Dimension>
void cpu_ActivePooling_updateOutput(at::Tensor inputSize,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features, bool average);
template <typename T, Int Dimension>
void cpu_ActivePooling_updateGradInput(
at::Tensor inputSize, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features, bool average);
template <typename T, Int Dimension>
void cpu_AveragePooling_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_AveragePooling_updateGradInput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features,
long nFeaturesToDrop);
template <typename T, Int Dimension>
double cpu_Convolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_Convolution_backward(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor filterSize, at::Tensor filterStride,
Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor weight,
at::Tensor d_weight, at::Tensor d_bias);
template <typename T, Int Dimension>
double cpu_SubmanifoldConvolution_updateOutput(
at::Tensor inputSize, at::Tensor filterSize, Metadata<Dimension> &m,
at::Tensor input_features, at::Tensor output_features, at::Tensor weight,
at::Tensor bias);
template <typename T, Int Dimension>
void cpu_SubmanifoldConvolution_backward(
at::Tensor inputSize, at::Tensor filterSize, Metadata<Dimension> &m,
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor weight, at::Tensor d_weight,
at::Tensor d_bias);
// FullConvolution takes separate input/output metadata (mIn / mOut).
template <typename T, Int Dimension>
double cpu_FullConvolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_FullConvolution_backward(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features,
at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);
template <typename T, Int Dimension>
double cpu_RandomizedStrideConvolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_RandomizedStrideConvolution_backward(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor d_output_features,
at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);
template <typename T, Int Dimension>
double cpu_Deconvolution_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor filterSize,
at::Tensor filterStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, at::Tensor weight, at::Tensor bias);
template <typename T, Int Dimension>
void cpu_Deconvolution_backward(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor filterSize, at::Tensor filterStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor weight,
at::Tensor d_weight, at::Tensor d_bias);
// Input/output layers convert between dense coordinate/feature tensors
// and the sparse internal representation tracked by Metadata.
template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(Metadata<Dimension> &m, at::Tensor spatialSize,
at::Tensor input_coords,
at::Tensor input_features,
at::Tensor output_features, long batchSize,
long mode);
template <typename T, Int Dimension>
void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features);
template <typename T, Int Dimension>
void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
at::Tensor spatialSize,
at::Tensor input_coords,
at::Tensor input_features,
at::Tensor output_features, long mode);
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features);
template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor poolSize, at::Tensor poolStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features,
long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_MaxPooling_updateGradInput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor output_features,
at::Tensor d_output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateOutput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateGradInput(
at::Tensor inputSize, at::Tensor outputSize, at::Tensor poolSize,
at::Tensor poolStride, Metadata<Dimension> &m, at::Tensor input_features,
at::Tensor d_input_features, at::Tensor output_features,
at::Tensor d_output_features, long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(at::Tensor inputSize,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features, long nPlanes);
template <typename T, Int Dimension>
void cpu_SparseToDense_updateGradInput(at::Tensor inputSize,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features);
template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor poolSize, at::Tensor poolStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor output_features,
long nFeaturesToDrop);
template <typename T, Int Dimension>
void cpu_UnPooling_updateGradInput(at::Tensor inputSize, at::Tensor outputSize,
at::Tensor poolSize, at::Tensor poolStride,
Metadata<Dimension> &m,
at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features,
long nFeaturesToDrop);
// Python bindings for the CPU sparse-convolution kernels.
//
// Every exported name is generated mechanically from the kernel name:
//   Metadata<D>                 -> class "Metadata_<D>"        (D = 1..4)
//   cpu_Name<float|double>      -> "cpu_float_Name" / "cpu_double_Name"
//   cpu_Name<float|double, D>   -> "cpu_float_Name_<D>" / "cpu_double_Name_<D>"
// Local macros below express that rule once instead of spelling out all
// ~280 registrations by hand; they are #undef-ed before the module ends.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // Expose Metadata<D> to Python as "Metadata_D". All four instantiations
  // export the same member set.
#define SCN_REGISTER_METADATA(D)                                               \
  pybind11::class_<Metadata<D>>(m, "Metadata_" #D)                             \
      .def(pybind11::init<>())                                                 \
      .def("clear", &Metadata<D>::clear)                                       \
      .def("setInputSpatialSize", &Metadata<D>::setInputSpatialSize)           \
      .def("batchAddSample", &Metadata<D>::batchAddSample)                     \
      .def("setInputSpatialLocation", &Metadata<D>::setInputSpatialLocation)   \
      .def("setInputSpatialLocations", &Metadata<D>::setInputSpatialLocations) \
      .def("getSpatialLocations", &Metadata<D>::getSpatialLocations)           \
      .def("createMetadataForDenseToSparse",                                   \
           &Metadata<D>::createMetadataForDenseToSparse)                       \
      .def("sparsifyMetadata", &Metadata<D>::sparsifyMetadata)                 \
      .def("addSampleFromThresholdedTensor",                                   \
           &Metadata<D>::addSampleFromThresholdedTensor)                       \
      .def("generateRuleBooks3s2", &Metadata<D>::generateRuleBooks3s2)         \
      .def("generateRuleBooks2s2", &Metadata<D>::generateRuleBooks2s2);
  SCN_REGISTER_METADATA(1)
  SCN_REGISTER_METADATA(2)
  SCN_REGISTER_METADATA(3)
  SCN_REGISTER_METADATA(4)
#undef SCN_REGISTER_METADATA

  // Bind a kernel templated only on the scalar type, once per precision.
#define SCN_BIND(NAME)                                                         \
  m.def("cpu_float_" #NAME, &cpu_##NAME<float>, "");                           \
  m.def("cpu_double_" #NAME, &cpu_##NAME<double>, "");
  SCN_BIND(AffineReluTrivialConvolution_updateOutput)
  SCN_BIND(AffineReluTrivialConvolution_backward)
  SCN_BIND(BatchwiseMultiplicativeDropout_updateOutput)
  SCN_BIND(BatchwiseMultiplicativeDropout_updateGradInput)
  SCN_BIND(BatchNormalization_updateOutput)
  SCN_BIND(BatchNormalization_backward)
  SCN_BIND(LeakyReLU_updateOutput)
  SCN_BIND(LeakyReLU_updateGradInput)
  SCN_BIND(NetworkInNetwork_updateOutput)
  SCN_BIND(NetworkInNetwork_updateGradInput)
  SCN_BIND(NetworkInNetwork_accGradParameters)
#undef SCN_BIND

  // Bind a kernel templated on scalar type AND spatial dimension, once per
  // precision and once per dimension 1..4.
#define SCN_BIND_DIM(NAME, D)                                                  \
  m.def("cpu_float_" #NAME "_" #D, &cpu_##NAME<float, D>, "");                 \
  m.def("cpu_double_" #NAME "_" #D, &cpu_##NAME<double, D>, "");
#define SCN_BIND_DIMS(NAME)                                                    \
  SCN_BIND_DIM(NAME, 1)                                                        \
  SCN_BIND_DIM(NAME, 2)                                                        \
  SCN_BIND_DIM(NAME, 3)                                                        \
  SCN_BIND_DIM(NAME, 4)
  SCN_BIND_DIMS(ActivePooling_updateOutput)
  SCN_BIND_DIMS(ActivePooling_updateGradInput)
  SCN_BIND_DIMS(AveragePooling_updateOutput)
  SCN_BIND_DIMS(AveragePooling_updateGradInput)
  SCN_BIND_DIMS(Convolution_updateOutput)
  SCN_BIND_DIMS(Convolution_backward)
  SCN_BIND_DIMS(RandomizedStrideConvolution_updateOutput)
  SCN_BIND_DIMS(RandomizedStrideConvolution_backward)
  SCN_BIND_DIMS(Deconvolution_updateOutput)
  SCN_BIND_DIMS(Deconvolution_backward)
  SCN_BIND_DIMS(FullConvolution_updateOutput)
  SCN_BIND_DIMS(FullConvolution_backward)
  SCN_BIND_DIMS(MaxPooling_updateOutput)
  SCN_BIND_DIMS(MaxPooling_updateGradInput)
  SCN_BIND_DIMS(RandomizedStrideMaxPooling_updateOutput)
  SCN_BIND_DIMS(RandomizedStrideMaxPooling_updateGradInput)
  SCN_BIND_DIMS(SparseToDense_updateOutput)
  SCN_BIND_DIMS(SparseToDense_updateGradInput)
  SCN_BIND_DIMS(SubmanifoldConvolution_updateOutput)
  SCN_BIND_DIMS(SubmanifoldConvolution_backward)
  SCN_BIND_DIMS(InputLayer_updateOutput)
  SCN_BIND_DIMS(InputLayer_updateGradInput)
  SCN_BIND_DIMS(OutputLayer_updateOutput)
  SCN_BIND_DIMS(OutputLayer_updateGradInput)
  SCN_BIND_DIMS(BLInputLayer_updateOutput)
  SCN_BIND_DIMS(BLInputLayer_updateGradInput)
  SCN_BIND_DIMS(BLOutputLayer_updateOutput)
  SCN_BIND_DIMS(BLOutputLayer_updateGradInput)
  SCN_BIND_DIMS(UnPooling_updateOutput)
  SCN_BIND_DIMS(UnPooling_updateGradInput)
#undef SCN_BIND_DIMS
#undef SCN_BIND_DIM

  // Width in bits of the Int type used internally (8 * sizeof(Int)), so the
  // Python side can tell how the extension was compiled.
  m.def("n_rulebook_bits", []() { return 8 * sizeof(Int); }, "");
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include <torch/torch.h>
#include "Metadata/Metadata.h"
// Fused affine + ReLU + 1x1 ("trivial") convolution, forward pass.
// Returns a double — presumably an operation count or similar statistic;
// the definition is not visible here, so confirm before relying on it.
template <typename T>
double cpu_AffineReluTrivialConvolution_updateOutput(at::Tensor input_features,
                                                     at::Tensor output_features,
                                                     at::Tensor affineWeight,
                                                     at::Tensor affineBias,
                                                     at::Tensor convWeight);
// Backward pass of the fused block: produces gradients for the input and
// for all three parameter tensors. additiveGrad presumably selects
// accumulate-into vs overwrite semantics for d_input_features — TODO confirm.
template <typename T>
void cpu_AffineReluTrivialConvolution_backward(
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, at::Tensor affineWeight,
    at::Tensor d_affineWeight, at::Tensor affineBias, at::Tensor d_affineBias,
    at::Tensor convWeight, at::Tensor d_convWeight, bool additiveGrad);
// Batch-normalization forward pass with a fused leaky-ReLU (leakiness).
// saveMean/saveInvStd receive per-batch statistics and runningMean/runningVar
// are the running estimates updated with `momentum` when `train` is set
// (presumed from the parameter names; definitions live elsewhere).
template <typename T>
void cpu_BatchNormalization_updateOutput(
    at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
    at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
    at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
    T leakiness);
// Variant of the batch-norm forward pass; identical signature to
// cpu_BatchNormalization_updateOutput. The "InTensor" distinction is not
// visible from this file — NOTE(review): check the implementation.
template <typename T>
void cpu_BatchNormalizationInTensor_updateOutput(
    at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
    at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
    at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
    T leakiness);
// Batch-normalization backward pass: consumes the saved statistics from the
// forward pass and emits d_input_features, d_weight and d_bias.
template <typename T>
void cpu_BatchNormalization_backward(
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor output_features, at::Tensor d_output_features,
    at::Tensor saveMean, at::Tensor saveInvStd, at::Tensor runningMean,
    at::Tensor runningVar, at::Tensor weight, at::Tensor bias,
    at::Tensor d_weight, at::Tensor d_bias, T leakiness);
// Batchwise multiplicative dropout, forward pass. `noise` carries the
// multiplicative mask and `alpha` is a scale factor (presumed from the
// names; the definition is compiled elsewhere).
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateOutput(at::Tensor input_features,
                                                     at::Tensor output_features,
                                                     at::Tensor noise,
                                                     float alpha);
// Batchwise multiplicative dropout, backward pass: reuses the same `noise`
// mask to propagate d_output_features into d_input_features.
template <typename T>
void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, at::Tensor noise, float alpha);
// Leaky ReLU forward pass; alpha is the negative-slope coefficient
// (presumed — standard leaky-ReLU convention, confirm in the definition).
template <typename T>
void cpu_LeakyReLU_updateOutput(at::Tensor input_features,
                                at::Tensor output_features, float alpha);
// Leaky ReLU backward pass. Takes input_features so the sign of the input
// can select which slope to apply to the gradient.
template <typename T>
void cpu_LeakyReLU_updateGradInput(at::Tensor input_features,
                                   at::Tensor d_input_features,
                                   at::Tensor d_output_features, float alpha);
// Network-in-Network (1x1 convolution) forward pass. Returns a double —
// presumably an operation count; definition not visible here.
template <typename T>
double cpu_NetworkInNetwork_updateOutput(at::Tensor input_features,
                                         at::Tensor output_features,
                                         at::Tensor weight, at::Tensor bias);
// NiN backward pass w.r.t. the input (weight is read, not modified).
template <typename T>
void cpu_NetworkInNetwork_updateGradInput(at::Tensor d_input_features,
                                          at::Tensor d_output_features,
                                          at::Tensor weight);
// NiN parameter-gradient accumulation: fills d_weight and d_bias from the
// forward inputs and the output gradients.
template <typename T>
void cpu_NetworkInNetwork_accGradParameters(at::Tensor input_features,
                                            at::Tensor d_output_features,
                                            at::Tensor d_weight,
                                            at::Tensor d_bias);
// --------------------------------------------------------------------------
// CPU kernels templated on the spatial dimensionality. Metadata<Dimension>
// carries the sparse spatial bookkeeping shared by these ops. Declarations
// only; definitions are compiled elsewhere and bound via m.def(...) below.
// --------------------------------------------------------------------------

// Pooling over all active sites of each sample; average selects mean vs
// sum pooling (presumably -- confirm against the definition).
template <typename T, Int Dimension>
void cpu_ActivePooling_updateOutput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    bool average);

template <typename T, Int Dimension>
void cpu_ActivePooling_updateGradInput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, bool average);

template <typename T, Int Dimension>
void cpu_AveragePooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cpu_AveragePooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, long nFeaturesToDrop);

// Strided sparse convolution, forward. Returns a double (presumably an
// operation count -- TODO confirm).
template <typename T, Int Dimension>
double cpu_Convolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cpu_Convolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

// Submanifold (sparsity-preserving) convolution: no stride/output-size
// parameters, so input and output share the same active-site set.
template <typename T, Int Dimension>
double cpu_SubmanifoldConvolution_updateOutput(
    at::Tensor inputSize, at::Tensor filterSize,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cpu_SubmanifoldConvolution_backward(
    at::Tensor inputSize, at::Tensor filterSize,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

// "Full" convolution takes distinct input and output metadata objects.
template <typename T, Int Dimension>
double cpu_FullConvolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cpu_FullConvolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

template <typename T, Int Dimension>
double cpu_RandomizedStrideConvolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cpu_RandomizedStrideConvolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

template <typename T, Int Dimension>
double cpu_Deconvolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cpu_Deconvolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

// Scatter dense per-coordinate input into the sparse representation; mode
// presumably controls duplicate-coordinate handling -- TODO confirm.
template <typename T, Int Dimension>
void cpu_InputLayer_updateOutput(
    Metadata<Dimension> &m, at::Tensor spatialSize,
    at::Tensor input_coords, at::Tensor input_features,
    at::Tensor output_features, long batchSize, long mode);

template <typename T, Int Dimension>
void cpu_InputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

template <typename T, Int Dimension>
void cpu_OutputLayer_updateOutput(
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features);

template <typename T, Int Dimension>
void cpu_OutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

// "BL" (batch/length layout -- presumably; confirm) input/output layers.
template <typename T, Int Dimension>
void cpu_BLInputLayer_updateOutput(
    Metadata<Dimension> &m, at::Tensor spatialSize,
    at::Tensor input_coords, at::Tensor input_features,
    at::Tensor output_features, long mode);

template <typename T, Int Dimension>
void cpu_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateOutput(
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features);

template <typename T, Int Dimension>
void cpu_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

template <typename T, Int Dimension>
void cpu_MaxPooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

// Max-pooling backward needs both forward tensors to locate the argmax.
template <typename T, Int Dimension>
void cpu_MaxPooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor output_features, at::Tensor d_output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cpu_RandomizedStrideMaxPooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor output_features, at::Tensor d_output_features,
    long nFeaturesToDrop);

// Expand the sparse representation to a dense tensor with nPlanes channels.
template <typename T, Int Dimension>
void cpu_SparseToDense_updateOutput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nPlanes);

template <typename T, Int Dimension>
void cpu_SparseToDense_updateGradInput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features);

template <typename T, Int Dimension>
void cpu_UnPooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cpu_UnPooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, long nFeaturesToDrop);
template <typename T>
double cuda_AffineReluTrivialConvolution_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor affineWeight,
at::Tensor affineBias,
at::Tensor convWeight);
template <typename T>
void cuda_AffineReluTrivialConvolution_backward(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor affineWeight,
at::Tensor d_affineWeight, at::Tensor affineBias, at::Tensor d_affineBias,
at::Tensor convWeight, at::Tensor d_convWeight, bool additiveGrad);
template <typename T>
void cuda_BatchNormalization_updateOutput(
at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void cuda_BatchNormalizationInTensor_updateOutput(
at::Tensor input_features, at::Tensor output_features, at::Tensor saveMean,
at::Tensor saveInvStd, at::Tensor runningMean, at::Tensor runningVar,
at::Tensor weight, at::Tensor bias, T eps, T momentum, bool train,
T leakiness);
template <typename T>
void cuda_BatchNormalization_backward(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor output_features, at::Tensor d_output_features,
at::Tensor saveMean, at::Tensor saveInvStd, at::Tensor runningMean,
at::Tensor runningVar, at::Tensor weight, at::Tensor bias,
at::Tensor d_weight, at::Tensor d_bias, T leakiness);
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor noise,
float alpha);
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
at::Tensor input_features, at::Tensor d_input_features,
at::Tensor d_output_features, at::Tensor noise, float alpha);
template <typename T>
void cuda_LeakyReLU_updateOutput(at::Tensor input_features,
at::Tensor output_features, float alpha);
template <typename T>
void cuda_LeakyReLU_updateGradInput(at::Tensor input_features,
at::Tensor d_input_features,
at::Tensor d_output_features, float alpha);
template <typename T>
double cuda_NetworkInNetwork_updateOutput(at::Tensor input_features,
at::Tensor output_features,
at::Tensor weight, at::Tensor bias);
template <typename T>
void cuda_NetworkInNetwork_updateGradInput(at::Tensor d_input_features,
at::Tensor d_output_features,
at::Tensor weight);
template <typename T>
void cuda_NetworkInNetwork_accGradParameters(at::Tensor input_features,
at::Tensor d_output_features,
at::Tensor d_weight,
at::Tensor d_bias);
// --------------------------------------------------------------------------
// CUDA counterparts of the dimension-templated CPU kernels above. Parameter
// contracts mirror the CPU declarations exactly. Declarations only.
// --------------------------------------------------------------------------

template <typename T, Int Dimension>
void cuda_ActivePooling_updateOutput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    bool average);

template <typename T, Int Dimension>
void cuda_ActivePooling_updateGradInput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, bool average);

template <typename T, Int Dimension>
void cuda_AveragePooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cuda_AveragePooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, long nFeaturesToDrop);

// Strided sparse convolution, forward. Returns a double (presumably an
// operation count -- TODO confirm).
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cuda_Convolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

// Submanifold (sparsity-preserving) convolution: no stride/output-size
// parameters, so input and output share the same active-site set.
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    at::Tensor inputSize, at::Tensor filterSize,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    at::Tensor inputSize, at::Tensor filterSize,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

// "Full" convolution takes distinct input and output metadata objects.
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &mIn, Metadata<Dimension> &mOut,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    at::Tensor weight, at::Tensor bias);

template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor filterSize, at::Tensor filterStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features,
    at::Tensor weight, at::Tensor d_weight, at::Tensor d_bias);

// Scatter dense per-coordinate input into the sparse representation; mode
// presumably controls duplicate-coordinate handling -- TODO confirm.
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(
    Metadata<Dimension> &m, at::Tensor spatialSize,
    at::Tensor input_coords, at::Tensor input_features,
    at::Tensor output_features, long batchSize, long mode);

template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features);

template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

// "BL" (batch/length layout -- presumably; confirm) input/output layers.
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(
    Metadata<Dimension> &m, at::Tensor spatialSize,
    at::Tensor input_coords, at::Tensor input_features,
    at::Tensor output_features, long mode);

template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features);

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    at::Tensor d_input_features, at::Tensor d_output_features);

template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

// Max-pooling backward needs both forward tensors to locate the argmax.
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor output_features, at::Tensor d_output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor output_features, at::Tensor d_output_features,
    long nFeaturesToDrop);

// Expand the sparse representation to a dense tensor with nPlanes channels.
template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nPlanes);

template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
    at::Tensor inputSize, Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features);

template <typename T, Int Dimension>
void cuda_UnPooling_updateOutput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor output_features,
    long nFeaturesToDrop);

template <typename T, Int Dimension>
void cuda_UnPooling_updateGradInput(
    at::Tensor inputSize, at::Tensor outputSize,
    at::Tensor poolSize, at::Tensor poolStride,
    Metadata<Dimension> &m,
    at::Tensor input_features, at::Tensor d_input_features,
    at::Tensor d_output_features, long nFeaturesToDrop);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
pybind11::class_<Metadata<1>>(m, "Metadata_1")
.def(pybind11::init<>())
.def("clear", &Metadata<1>::clear)
.def("setInputSpatialSize", &Metadata<1>::setInputSpatialSize)
.def("batchAddSample", &Metadata<1>::batchAddSample)
.def("setInputSpatialLocation", &Metadata<1>::setInputSpatialLocation)
.def("setInputSpatialLocations", &Metadata<1>::setInputSpatialLocations)
.def("getSpatialLocations", &Metadata<1>::getSpatialLocations)
.def("createMetadataForDenseToSparse", &Metadata<1>::createMetadataForDenseToSparse)
.def("sparsifyMetadata", &Metadata<1>::sparsifyMetadata)
.def("addSampleFromThresholdedTensor", &Metadata<1>::addSampleFromThresholdedTensor)
.def("generateRuleBooks3s2", &Metadata<1>::generateRuleBooks3s2)
.def("generateRuleBooks2s2", &Metadata<1>::generateRuleBooks2s2);
pybind11::class_<Metadata<2>>(m, "Metadata_2")
.def(pybind11::init<>())
.def("clear", &Metadata<2>::clear)
.def("setInputSpatialSize", &Metadata<2>::setInputSpatialSize)
.def("batchAddSample", &Metadata<2>::batchAddSample)
.def("setInputSpatialLocation", &Metadata<2>::setInputSpatialLocation)
.def("setInputSpatialLocations", &Metadata<2>::setInputSpatialLocations)
.def("getSpatialLocations", &Metadata<2>::getSpatialLocations)
.def("createMetadataForDenseToSparse", &Metadata<2>::createMetadataForDenseToSparse)
.def("sparsifyMetadata", &Metadata<2>::sparsifyMetadata)
.def("addSampleFromThresholdedTensor", &Metadata<2>::addSampleFromThresholdedTensor)
.def("generateRuleBooks3s2", &Metadata<2>::generateRuleBooks3s2)
.def("generateRuleBooks2s2", &Metadata<2>::generateRuleBooks2s2);
pybind11::class_<Metadata<3>>(m, "Metadata_3")
.def(pybind11::init<>())
.def("clear", &Metadata<3>::clear)
.def("setInputSpatialSize", &Metadata<3>::setInputSpatialSize)
.def("batchAddSample", &Metadata<3>::batchAddSample)
.def("setInputSpatialLocation", &Metadata<3>::setInputSpatialLocation)
.def("setInputSpatialLocations", &Metadata<3>::setInputSpatialLocations)
.def("getSpatialLocations", &Metadata<3>::getSpatialLocations)
.def("createMetadataForDenseToSparse", &Metadata<3>::createMetadataForDenseToSparse)
.def("sparsifyMetadata", &Metadata<3>::sparsifyMetadata)
.def("addSampleFromThresholdedTensor", &Metadata<3>::addSampleFromThresholdedTensor)
.def("generateRuleBooks3s2", &Metadata<3>::generateRuleBooks3s2)
.def("generateRuleBooks2s2", &Metadata<3>::generateRuleBooks2s2);
pybind11::class_<Metadata<4>>(m, "Metadata_4")
.def(pybind11::init<>())
.def("clear", &Metadata<4>::clear)
.def("setInputSpatialSize", &Metadata<4>::setInputSpatialSize)
.def("batchAddSample", &Metadata<4>::batchAddSample)
.def("setInputSpatialLocation", &Metadata<4>::setInputSpatialLocation)
.def("setInputSpatialLocations", &Metadata<4>::setInputSpatialLocations)
.def("getSpatialLocations", &Metadata<4>::getSpatialLocations)
.def("createMetadataForDenseToSparse", &Metadata<4>::createMetadataForDenseToSparse)
.def("sparsifyMetadata", &Metadata<4>::sparsifyMetadata)
.def("addSampleFromThresholdedTensor", &Metadata<4>::addSampleFromThresholdedTensor)
.def("generateRuleBooks3s2", &Metadata<4>::generateRuleBooks3s2)
.def("generateRuleBooks2s2", &Metadata<4>::generateRuleBooks2s2);
m.def("cpu_float_AffineReluTrivialConvolution_updateOutput", &cpu_AffineReluTrivialConvolution_updateOutput<float>, "");
m.def("cpu_double_AffineReluTrivialConvolution_updateOutput", &cpu_AffineReluTrivialConvolution_updateOutput<double>, "");
m.def("cuda_float_AffineReluTrivialConvolution_updateOutput", &cuda_AffineReluTrivialConvolution_updateOutput<float>, "");
m.def("cpu_float_AffineReluTrivialConvolution_backward", &cpu_AffineReluTrivialConvolution_backward<float>, "");
m.def("cpu_double_AffineReluTrivialConvolution_backward", &cpu_AffineReluTrivialConvolution_backward<double>, "");
m.def("cuda_float_AffineReluTrivialConvolution_backward", &cuda_AffineReluTrivialConvolution_backward<float>, "");
m.def("cpu_float_BatchwiseMultiplicativeDropout_updateOutput", &cpu_BatchwiseMultiplicativeDropout_updateOutput<float>, "");
m.def("cpu_double_BatchwiseMultiplicativeDropout_updateOutput", &cpu_BatchwiseMultiplicativeDropout_updateOutput<double>, "");
m.def("cuda_float_BatchwiseMultiplicativeDropout_updateOutput", &cuda_BatchwiseMultiplicativeDropout_updateOutput<float>, "");
m.def("cpu_float_BatchwiseMultiplicativeDropout_updateGradInput", &cpu_BatchwiseMultiplicativeDropout_updateGradInput<float>, "");
m.def("cpu_double_BatchwiseMultiplicativeDropout_updateGradInput", &cpu_BatchwiseMultiplicativeDropout_updateGradInput<double>, "");
m.def("cuda_float_BatchwiseMultiplicativeDropout_updateGradInput", &cuda_BatchwiseMultiplicativeDropout_updateGradInput<float>, "");
m.def("cpu_float_BatchNormalization_updateOutput", &cpu_BatchNormalization_updateOutput<float>, "");
m.def("cpu_double_BatchNormalization_updateOutput", &cpu_BatchNormalization_updateOutput<double>, "");
m.def("cuda_float_BatchNormalization_updateOutput", &cuda_BatchNormalization_updateOutput<float>, "");
m.def("cpu_float_BatchNormalization_backward", &cpu_BatchNormalization_backward<float>, "");
m.def("cpu_double_BatchNormalization_backward", &cpu_BatchNormalization_backward<double>, "");
m.def("cuda_float_BatchNormalization_backward", &cuda_BatchNormalization_backward<float>, "");
m.def("cpu_float_LeakyReLU_updateOutput", &cpu_LeakyReLU_updateOutput<float>, "");
m.def("cpu_double_LeakyReLU_updateOutput", &cpu_LeakyReLU_updateOutput<double>, "");
m.def("cuda_float_LeakyReLU_updateOutput", &cuda_LeakyReLU_updateOutput<float>, "");
m.def("cpu_float_LeakyReLU_updateGradInput", &cpu_LeakyReLU_updateGradInput<float>, "");
m.def("cpu_double_LeakyReLU_updateGradInput", &cpu_LeakyReLU_updateGradInput<double>, "");
m.def("cuda_float_LeakyReLU_updateGradInput", &cuda_LeakyReLU_updateGradInput<float>, "");
m.def("cpu_float_NetworkInNetwork_updateOutput", &cpu_NetworkInNetwork_updateOutput<float>, "");
m.def("cpu_double_NetworkInNetwork_updateOutput", &cpu_NetworkInNetwork_updateOutput<double>, "");
m.def("cuda_float_NetworkInNetwork_updateOutput", &cuda_NetworkInNetwork_updateOutput<float>, "");
m.def("cpu_float_NetworkInNetwork_updateGradInput", &cpu_NetworkInNetwork_updateGradInput<float>, "");
m.def("cpu_double_NetworkInNetwork_updateGradInput", &cpu_NetworkInNetwork_updateGradInput<double>, "");
m.def("cuda_float_NetworkInNetwork_updateGradInput", &cuda_NetworkInNetwork_updateGradInput<float>, "");
m.def("cpu_float_NetworkInNetwork_accGradParameters", &cpu_NetworkInNetwork_accGradParameters<float>, "");
m.def("cpu_double_NetworkInNetwork_accGradParameters", &cpu_NetworkInNetwork_accGradParameters<double>, "");
m.def("cuda_float_NetworkInNetwork_accGradParameters", &cuda_NetworkInNetwork_accGradParameters<float>, "");
m.def("cpu_float_ActivePooling_updateOutput_1", &cpu_ActivePooling_updateOutput<float,1>, "");
m.def("cpu_double_ActivePooling_updateOutput_1", &cpu_ActivePooling_updateOutput<double,1>, "");
m.def("cuda_float_ActivePooling_updateOutput_1", &cuda_ActivePooling_updateOutput<float,1>, "");
m.def("cpu_float_ActivePooling_updateOutput_2", &cpu_ActivePooling_updateOutput<float,2>, "");
m.def("cpu_double_ActivePooling_updateOutput_2", &cpu_ActivePooling_updateOutput<double,2>, "");
m.def("cuda_float_ActivePooling_updateOutput_2", &cuda_ActivePooling_updateOutput<float,2>, "");
m.def("cpu_float_ActivePooling_updateOutput_3", &cpu_ActivePooling_updateOutput<float,3>, "");
m.def("cpu_double_ActivePooling_updateOutput_3", &cpu_ActivePooling_updateOutput<double,3>, "");
m.def("cuda_float_ActivePooling_updateOutput_3", &cuda_ActivePooling_updateOutput<float,3>, "");
m.def("cpu_float_ActivePooling_updateOutput_4", &cpu_ActivePooling_updateOutput<float,4>, "");
m.def("cpu_double_ActivePooling_updateOutput_4", &cpu_ActivePooling_updateOutput<double,4>, "");
m.def("cuda_float_ActivePooling_updateOutput_4", &cuda_ActivePooling_updateOutput<float,4>, "");
m.def("cpu_float_ActivePooling_updateGradInput_1", &cpu_ActivePooling_updateGradInput<float,1>, "");
m.def("cpu_double_ActivePooling_updateGradInput_1", &cpu_ActivePooling_updateGradInput<double,1>, "");
m.def("cuda_float_ActivePooling_updateGradInput_1", &cuda_ActivePooling_updateGradInput<float,1>, "");
m.def("cpu_float_ActivePooling_updateGradInput_2", &cpu_ActivePooling_updateGradInput<float,2>, "");
m.def("cpu_double_ActivePooling_updateGradInput_2", &cpu_ActivePooling_updateGradInput<double,2>, "");
m.def("cuda_float_ActivePooling_updateGradInput_2", &cuda_ActivePooling_updateGradInput<float,2>, "");
m.def("cpu_float_ActivePooling_updateGradInput_3", &cpu_ActivePooling_updateGradInput<float,3>, "");
m.def("cpu_double_ActivePooling_updateGradInput_3", &cpu_ActivePooling_updateGradInput<double,3>, "");
m.def("cuda_float_ActivePooling_updateGradInput_3", &cuda_ActivePooling_updateGradInput<float,3>, "");
m.def("cpu_float_ActivePooling_updateGradInput_4", &cpu_ActivePooling_updateGradInput<float,4>, "");
m.def("cpu_double_ActivePooling_updateGradInput_4", &cpu_ActivePooling_updateGradInput<double,4>, "");
m.def("cuda_float_ActivePooling_updateGradInput_4", &cuda_ActivePooling_updateGradInput<float,4>, "");
m.def("cpu_float_AveragePooling_updateOutput_1", &cpu_AveragePooling_updateOutput<float,1>, "");
m.def("cpu_double_AveragePooling_updateOutput_1", &cpu_AveragePooling_updateOutput<double,1>, "");
m.def("cuda_float_AveragePooling_updateOutput_1", &cuda_AveragePooling_updateOutput<float,1>, "");
m.def("cpu_float_AveragePooling_updateOutput_2", &cpu_AveragePooling_updateOutput<float,2>, "");
m.def("cpu_double_AveragePooling_updateOutput_2", &cpu_AveragePooling_updateOutput<double,2>, "");
m.def("cuda_float_AveragePooling_updateOutput_2", &cuda_AveragePooling_updateOutput<float,2>, "");
m.def("cpu_float_AveragePooling_updateOutput_3", &cpu_AveragePooling_updateOutput<float,3>, "");
m.def("cpu_double_AveragePooling_updateOutput_3", &cpu_AveragePooling_updateOutput<double,3>, "");
m.def("cuda_float_AveragePooling_updateOutput_3", &cuda_AveragePooling_updateOutput<float,3>, "");
m.def("cpu_float_AveragePooling_updateOutput_4", &cpu_AveragePooling_updateOutput<float,4>, "");
m.def("cpu_double_AveragePooling_updateOutput_4", &cpu_AveragePooling_updateOutput<double,4>, "");
m.def("cuda_float_AveragePooling_updateOutput_4", &cuda_AveragePooling_updateOutput<float,4>, "");
m.def("cpu_float_AveragePooling_updateGradInput_1", &cpu_AveragePooling_updateGradInput<float,1>, "");
m.def("cpu_double_AveragePooling_updateGradInput_1", &cpu_AveragePooling_updateGradInput<double,1>, "");
m.def("cuda_float_AveragePooling_updateGradInput_1", &cuda_AveragePooling_updateGradInput<float,1>, "");
m.def("cpu_float_AveragePooling_updateGradInput_2", &cpu_AveragePooling_updateGradInput<float,2>, "");
m.def("cpu_double_AveragePooling_updateGradInput_2", &cpu_AveragePooling_updateGradInput<double,2>, "");
m.def("cuda_float_AveragePooling_updateGradInput_2", &cuda_AveragePooling_updateGradInput<float,2>, "");
m.def("cpu_float_AveragePooling_updateGradInput_3", &cpu_AveragePooling_updateGradInput<float,3>, "");
m.def("cpu_double_AveragePooling_updateGradInput_3", &cpu_AveragePooling_updateGradInput<double,3>, "");
m.def("cuda_float_AveragePooling_updateGradInput_3", &cuda_AveragePooling_updateGradInput<float,3>, "");
m.def("cpu_float_AveragePooling_updateGradInput_4", &cpu_AveragePooling_updateGradInput<float,4>, "");
m.def("cpu_double_AveragePooling_updateGradInput_4", &cpu_AveragePooling_updateGradInput<double,4>, "");
m.def("cuda_float_AveragePooling_updateGradInput_4", &cuda_AveragePooling_updateGradInput<float,4>, "");
// ---------------------------------------------------------------------------
// Registration table for the sparse-convolution operator bindings.
// Each operator is instantiated and exposed once per (backend, scalar type,
// spatial dimension):
//   cpu  : float and double
//   cuda : float only (no cuda_double_* entries are registered here)
// for dimensions 1..4. The exported name is built mechanically as
// "<backend>_<dtype>_<Operator>_<dimension>", so Python-side wrappers can
// select the entry point by string construction.
// NOTE(review): the enclosing module-definition function (and the binder `m`)
// begins above this chunk; this file also appears to be emitted by a
// generator script (see the pybind_cpu.cpp/pybind_cuda.cpp writer), so any
// change belongs in the generator, not here — confirm before hand-editing.
// ---------------------------------------------------------------------------
// Convolution: forward (updateOutput) and backward.
m.def("cpu_float_Convolution_updateOutput_1", &cpu_Convolution_updateOutput<float,1>, "");
m.def("cpu_double_Convolution_updateOutput_1", &cpu_Convolution_updateOutput<double,1>, "");
m.def("cuda_float_Convolution_updateOutput_1", &cuda_Convolution_updateOutput<float,1>, "");
m.def("cpu_float_Convolution_updateOutput_2", &cpu_Convolution_updateOutput<float,2>, "");
m.def("cpu_double_Convolution_updateOutput_2", &cpu_Convolution_updateOutput<double,2>, "");
m.def("cuda_float_Convolution_updateOutput_2", &cuda_Convolution_updateOutput<float,2>, "");
m.def("cpu_float_Convolution_updateOutput_3", &cpu_Convolution_updateOutput<float,3>, "");
m.def("cpu_double_Convolution_updateOutput_3", &cpu_Convolution_updateOutput<double,3>, "");
m.def("cuda_float_Convolution_updateOutput_3", &cuda_Convolution_updateOutput<float,3>, "");
m.def("cpu_float_Convolution_updateOutput_4", &cpu_Convolution_updateOutput<float,4>, "");
m.def("cpu_double_Convolution_updateOutput_4", &cpu_Convolution_updateOutput<double,4>, "");
m.def("cuda_float_Convolution_updateOutput_4", &cuda_Convolution_updateOutput<float,4>, "");
m.def("cpu_float_Convolution_backward_1", &cpu_Convolution_backward<float,1>, "");
m.def("cpu_double_Convolution_backward_1", &cpu_Convolution_backward<double,1>, "");
m.def("cuda_float_Convolution_backward_1", &cuda_Convolution_backward<float,1>, "");
m.def("cpu_float_Convolution_backward_2", &cpu_Convolution_backward<float,2>, "");
m.def("cpu_double_Convolution_backward_2", &cpu_Convolution_backward<double,2>, "");
m.def("cuda_float_Convolution_backward_2", &cuda_Convolution_backward<float,2>, "");
m.def("cpu_float_Convolution_backward_3", &cpu_Convolution_backward<float,3>, "");
m.def("cpu_double_Convolution_backward_3", &cpu_Convolution_backward<double,3>, "");
m.def("cuda_float_Convolution_backward_3", &cuda_Convolution_backward<float,3>, "");
m.def("cpu_float_Convolution_backward_4", &cpu_Convolution_backward<float,4>, "");
m.def("cpu_double_Convolution_backward_4", &cpu_Convolution_backward<double,4>, "");
m.def("cuda_float_Convolution_backward_4", &cuda_Convolution_backward<float,4>, "");
// RandomizedStrideConvolution: forward and backward.
m.def("cpu_float_RandomizedStrideConvolution_updateOutput_1", &cpu_RandomizedStrideConvolution_updateOutput<float,1>, "");
m.def("cpu_double_RandomizedStrideConvolution_updateOutput_1", &cpu_RandomizedStrideConvolution_updateOutput<double,1>, "");
m.def("cuda_float_RandomizedStrideConvolution_updateOutput_1", &cuda_RandomizedStrideConvolution_updateOutput<float,1>, "");
m.def("cpu_float_RandomizedStrideConvolution_updateOutput_2", &cpu_RandomizedStrideConvolution_updateOutput<float,2>, "");
m.def("cpu_double_RandomizedStrideConvolution_updateOutput_2", &cpu_RandomizedStrideConvolution_updateOutput<double,2>, "");
m.def("cuda_float_RandomizedStrideConvolution_updateOutput_2", &cuda_RandomizedStrideConvolution_updateOutput<float,2>, "");
m.def("cpu_float_RandomizedStrideConvolution_updateOutput_3", &cpu_RandomizedStrideConvolution_updateOutput<float,3>, "");
m.def("cpu_double_RandomizedStrideConvolution_updateOutput_3", &cpu_RandomizedStrideConvolution_updateOutput<double,3>, "");
m.def("cuda_float_RandomizedStrideConvolution_updateOutput_3", &cuda_RandomizedStrideConvolution_updateOutput<float,3>, "");
m.def("cpu_float_RandomizedStrideConvolution_updateOutput_4", &cpu_RandomizedStrideConvolution_updateOutput<float,4>, "");
m.def("cpu_double_RandomizedStrideConvolution_updateOutput_4", &cpu_RandomizedStrideConvolution_updateOutput<double,4>, "");
m.def("cuda_float_RandomizedStrideConvolution_updateOutput_4", &cuda_RandomizedStrideConvolution_updateOutput<float,4>, "");
m.def("cpu_float_RandomizedStrideConvolution_backward_1", &cpu_RandomizedStrideConvolution_backward<float,1>, "");
m.def("cpu_double_RandomizedStrideConvolution_backward_1", &cpu_RandomizedStrideConvolution_backward<double,1>, "");
m.def("cuda_float_RandomizedStrideConvolution_backward_1", &cuda_RandomizedStrideConvolution_backward<float,1>, "");
m.def("cpu_float_RandomizedStrideConvolution_backward_2", &cpu_RandomizedStrideConvolution_backward<float,2>, "");
m.def("cpu_double_RandomizedStrideConvolution_backward_2", &cpu_RandomizedStrideConvolution_backward<double,2>, "");
m.def("cuda_float_RandomizedStrideConvolution_backward_2", &cuda_RandomizedStrideConvolution_backward<float,2>, "");
m.def("cpu_float_RandomizedStrideConvolution_backward_3", &cpu_RandomizedStrideConvolution_backward<float,3>, "");
m.def("cpu_double_RandomizedStrideConvolution_backward_3", &cpu_RandomizedStrideConvolution_backward<double,3>, "");
m.def("cuda_float_RandomizedStrideConvolution_backward_3", &cuda_RandomizedStrideConvolution_backward<float,3>, "");
m.def("cpu_float_RandomizedStrideConvolution_backward_4", &cpu_RandomizedStrideConvolution_backward<float,4>, "");
m.def("cpu_double_RandomizedStrideConvolution_backward_4", &cpu_RandomizedStrideConvolution_backward<double,4>, "");
m.def("cuda_float_RandomizedStrideConvolution_backward_4", &cuda_RandomizedStrideConvolution_backward<float,4>, "");
// Deconvolution: forward and backward.
m.def("cpu_float_Deconvolution_updateOutput_1", &cpu_Deconvolution_updateOutput<float,1>, "");
m.def("cpu_double_Deconvolution_updateOutput_1", &cpu_Deconvolution_updateOutput<double,1>, "");
m.def("cuda_float_Deconvolution_updateOutput_1", &cuda_Deconvolution_updateOutput<float,1>, "");
m.def("cpu_float_Deconvolution_updateOutput_2", &cpu_Deconvolution_updateOutput<float,2>, "");
m.def("cpu_double_Deconvolution_updateOutput_2", &cpu_Deconvolution_updateOutput<double,2>, "");
m.def("cuda_float_Deconvolution_updateOutput_2", &cuda_Deconvolution_updateOutput<float,2>, "");
m.def("cpu_float_Deconvolution_updateOutput_3", &cpu_Deconvolution_updateOutput<float,3>, "");
m.def("cpu_double_Deconvolution_updateOutput_3", &cpu_Deconvolution_updateOutput<double,3>, "");
m.def("cuda_float_Deconvolution_updateOutput_3", &cuda_Deconvolution_updateOutput<float,3>, "");
m.def("cpu_float_Deconvolution_updateOutput_4", &cpu_Deconvolution_updateOutput<float,4>, "");
m.def("cpu_double_Deconvolution_updateOutput_4", &cpu_Deconvolution_updateOutput<double,4>, "");
m.def("cuda_float_Deconvolution_updateOutput_4", &cuda_Deconvolution_updateOutput<float,4>, "");
m.def("cpu_float_Deconvolution_backward_1", &cpu_Deconvolution_backward<float,1>, "");
m.def("cpu_double_Deconvolution_backward_1", &cpu_Deconvolution_backward<double,1>, "");
m.def("cuda_float_Deconvolution_backward_1", &cuda_Deconvolution_backward<float,1>, "");
m.def("cpu_float_Deconvolution_backward_2", &cpu_Deconvolution_backward<float,2>, "");
m.def("cpu_double_Deconvolution_backward_2", &cpu_Deconvolution_backward<double,2>, "");
m.def("cuda_float_Deconvolution_backward_2", &cuda_Deconvolution_backward<float,2>, "");
m.def("cpu_float_Deconvolution_backward_3", &cpu_Deconvolution_backward<float,3>, "");
m.def("cpu_double_Deconvolution_backward_3", &cpu_Deconvolution_backward<double,3>, "");
m.def("cuda_float_Deconvolution_backward_3", &cuda_Deconvolution_backward<float,3>, "");
m.def("cpu_float_Deconvolution_backward_4", &cpu_Deconvolution_backward<float,4>, "");
m.def("cpu_double_Deconvolution_backward_4", &cpu_Deconvolution_backward<double,4>, "");
m.def("cuda_float_Deconvolution_backward_4", &cuda_Deconvolution_backward<float,4>, "");
// FullConvolution: forward and backward.
m.def("cpu_float_FullConvolution_updateOutput_1", &cpu_FullConvolution_updateOutput<float,1>, "");
m.def("cpu_double_FullConvolution_updateOutput_1", &cpu_FullConvolution_updateOutput<double,1>, "");
m.def("cuda_float_FullConvolution_updateOutput_1", &cuda_FullConvolution_updateOutput<float,1>, "");
m.def("cpu_float_FullConvolution_updateOutput_2", &cpu_FullConvolution_updateOutput<float,2>, "");
m.def("cpu_double_FullConvolution_updateOutput_2", &cpu_FullConvolution_updateOutput<double,2>, "");
m.def("cuda_float_FullConvolution_updateOutput_2", &cuda_FullConvolution_updateOutput<float,2>, "");
m.def("cpu_float_FullConvolution_updateOutput_3", &cpu_FullConvolution_updateOutput<float,3>, "");
m.def("cpu_double_FullConvolution_updateOutput_3", &cpu_FullConvolution_updateOutput<double,3>, "");
m.def("cuda_float_FullConvolution_updateOutput_3", &cuda_FullConvolution_updateOutput<float,3>, "");
m.def("cpu_float_FullConvolution_updateOutput_4", &cpu_FullConvolution_updateOutput<float,4>, "");
m.def("cpu_double_FullConvolution_updateOutput_4", &cpu_FullConvolution_updateOutput<double,4>, "");
m.def("cuda_float_FullConvolution_updateOutput_4", &cuda_FullConvolution_updateOutput<float,4>, "");
m.def("cpu_float_FullConvolution_backward_1", &cpu_FullConvolution_backward<float,1>, "");
m.def("cpu_double_FullConvolution_backward_1", &cpu_FullConvolution_backward<double,1>, "");
m.def("cuda_float_FullConvolution_backward_1", &cuda_FullConvolution_backward<float,1>, "");
m.def("cpu_float_FullConvolution_backward_2", &cpu_FullConvolution_backward<float,2>, "");
m.def("cpu_double_FullConvolution_backward_2", &cpu_FullConvolution_backward<double,2>, "");
m.def("cuda_float_FullConvolution_backward_2", &cuda_FullConvolution_backward<float,2>, "");
m.def("cpu_float_FullConvolution_backward_3", &cpu_FullConvolution_backward<float,3>, "");
m.def("cpu_double_FullConvolution_backward_3", &cpu_FullConvolution_backward<double,3>, "");
m.def("cuda_float_FullConvolution_backward_3", &cuda_FullConvolution_backward<float,3>, "");
m.def("cpu_float_FullConvolution_backward_4", &cpu_FullConvolution_backward<float,4>, "");
m.def("cpu_double_FullConvolution_backward_4", &cpu_FullConvolution_backward<double,4>, "");
m.def("cuda_float_FullConvolution_backward_4", &cuda_FullConvolution_backward<float,4>, "");
// MaxPooling: forward and input-gradient.
m.def("cpu_float_MaxPooling_updateOutput_1", &cpu_MaxPooling_updateOutput<float,1>, "");
m.def("cpu_double_MaxPooling_updateOutput_1", &cpu_MaxPooling_updateOutput<double,1>, "");
m.def("cuda_float_MaxPooling_updateOutput_1", &cuda_MaxPooling_updateOutput<float,1>, "");
m.def("cpu_float_MaxPooling_updateOutput_2", &cpu_MaxPooling_updateOutput<float,2>, "");
m.def("cpu_double_MaxPooling_updateOutput_2", &cpu_MaxPooling_updateOutput<double,2>, "");
m.def("cuda_float_MaxPooling_updateOutput_2", &cuda_MaxPooling_updateOutput<float,2>, "");
m.def("cpu_float_MaxPooling_updateOutput_3", &cpu_MaxPooling_updateOutput<float,3>, "");
m.def("cpu_double_MaxPooling_updateOutput_3", &cpu_MaxPooling_updateOutput<double,3>, "");
m.def("cuda_float_MaxPooling_updateOutput_3", &cuda_MaxPooling_updateOutput<float,3>, "");
m.def("cpu_float_MaxPooling_updateOutput_4", &cpu_MaxPooling_updateOutput<float,4>, "");
m.def("cpu_double_MaxPooling_updateOutput_4", &cpu_MaxPooling_updateOutput<double,4>, "");
m.def("cuda_float_MaxPooling_updateOutput_4", &cuda_MaxPooling_updateOutput<float,4>, "");
m.def("cpu_float_MaxPooling_updateGradInput_1", &cpu_MaxPooling_updateGradInput<float,1>, "");
m.def("cpu_double_MaxPooling_updateGradInput_1", &cpu_MaxPooling_updateGradInput<double,1>, "");
m.def("cuda_float_MaxPooling_updateGradInput_1", &cuda_MaxPooling_updateGradInput<float,1>, "");
m.def("cpu_float_MaxPooling_updateGradInput_2", &cpu_MaxPooling_updateGradInput<float,2>, "");
m.def("cpu_double_MaxPooling_updateGradInput_2", &cpu_MaxPooling_updateGradInput<double,2>, "");
m.def("cuda_float_MaxPooling_updateGradInput_2", &cuda_MaxPooling_updateGradInput<float,2>, "");
m.def("cpu_float_MaxPooling_updateGradInput_3", &cpu_MaxPooling_updateGradInput<float,3>, "");
m.def("cpu_double_MaxPooling_updateGradInput_3", &cpu_MaxPooling_updateGradInput<double,3>, "");
m.def("cuda_float_MaxPooling_updateGradInput_3", &cuda_MaxPooling_updateGradInput<float,3>, "");
m.def("cpu_float_MaxPooling_updateGradInput_4", &cpu_MaxPooling_updateGradInput<float,4>, "");
m.def("cpu_double_MaxPooling_updateGradInput_4", &cpu_MaxPooling_updateGradInput<double,4>, "");
m.def("cuda_float_MaxPooling_updateGradInput_4", &cuda_MaxPooling_updateGradInput<float,4>, "");
// RandomizedStrideMaxPooling: forward and input-gradient.
m.def("cpu_float_RandomizedStrideMaxPooling_updateOutput_1", &cpu_RandomizedStrideMaxPooling_updateOutput<float,1>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateOutput_1", &cpu_RandomizedStrideMaxPooling_updateOutput<double,1>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateOutput_1", &cuda_RandomizedStrideMaxPooling_updateOutput<float,1>, "");
m.def("cpu_float_RandomizedStrideMaxPooling_updateOutput_2", &cpu_RandomizedStrideMaxPooling_updateOutput<float,2>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateOutput_2", &cpu_RandomizedStrideMaxPooling_updateOutput<double,2>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateOutput_2", &cuda_RandomizedStrideMaxPooling_updateOutput<float,2>, "");
m.def("cpu_float_RandomizedStrideMaxPooling_updateOutput_3", &cpu_RandomizedStrideMaxPooling_updateOutput<float,3>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateOutput_3", &cpu_RandomizedStrideMaxPooling_updateOutput<double,3>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateOutput_3", &cuda_RandomizedStrideMaxPooling_updateOutput<float,3>, "");
m.def("cpu_float_RandomizedStrideMaxPooling_updateOutput_4", &cpu_RandomizedStrideMaxPooling_updateOutput<float,4>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateOutput_4", &cpu_RandomizedStrideMaxPooling_updateOutput<double,4>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateOutput_4", &cuda_RandomizedStrideMaxPooling_updateOutput<float,4>, "");
m.def("cpu_float_RandomizedStrideMaxPooling_updateGradInput_1", &cpu_RandomizedStrideMaxPooling_updateGradInput<float,1>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateGradInput_1", &cpu_RandomizedStrideMaxPooling_updateGradInput<double,1>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateGradInput_1", &cuda_RandomizedStrideMaxPooling_updateGradInput<float,1>, "");
m.def("cpu_float_RandomizedStrideMaxPooling_updateGradInput_2", &cpu_RandomizedStrideMaxPooling_updateGradInput<float,2>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateGradInput_2", &cpu_RandomizedStrideMaxPooling_updateGradInput<double,2>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateGradInput_2", &cuda_RandomizedStrideMaxPooling_updateGradInput<float,2>, "");
m.def("cpu_float_RandomizedStrideMaxPooling_updateGradInput_3", &cpu_RandomizedStrideMaxPooling_updateGradInput<float,3>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateGradInput_3", &cpu_RandomizedStrideMaxPooling_updateGradInput<double,3>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateGradInput_3", &cuda_RandomizedStrideMaxPooling_updateGradInput<float,3>, "");
m.def("cpu_float_RandomizedStrideMaxPooling_updateGradInput_4", &cpu_RandomizedStrideMaxPooling_updateGradInput<float,4>, "");
m.def("cpu_double_RandomizedStrideMaxPooling_updateGradInput_4", &cpu_RandomizedStrideMaxPooling_updateGradInput<double,4>, "");
m.def("cuda_float_RandomizedStrideMaxPooling_updateGradInput_4", &cuda_RandomizedStrideMaxPooling_updateGradInput<float,4>, "");
// SparseToDense: forward and input-gradient.
m.def("cpu_float_SparseToDense_updateOutput_1", &cpu_SparseToDense_updateOutput<float,1>, "");
m.def("cpu_double_SparseToDense_updateOutput_1", &cpu_SparseToDense_updateOutput<double,1>, "");
m.def("cuda_float_SparseToDense_updateOutput_1", &cuda_SparseToDense_updateOutput<float,1>, "");
m.def("cpu_float_SparseToDense_updateOutput_2", &cpu_SparseToDense_updateOutput<float,2>, "");
m.def("cpu_double_SparseToDense_updateOutput_2", &cpu_SparseToDense_updateOutput<double,2>, "");
m.def("cuda_float_SparseToDense_updateOutput_2", &cuda_SparseToDense_updateOutput<float,2>, "");
m.def("cpu_float_SparseToDense_updateOutput_3", &cpu_SparseToDense_updateOutput<float,3>, "");
m.def("cpu_double_SparseToDense_updateOutput_3", &cpu_SparseToDense_updateOutput<double,3>, "");
m.def("cuda_float_SparseToDense_updateOutput_3", &cuda_SparseToDense_updateOutput<float,3>, "");
m.def("cpu_float_SparseToDense_updateOutput_4", &cpu_SparseToDense_updateOutput<float,4>, "");
m.def("cpu_double_SparseToDense_updateOutput_4", &cpu_SparseToDense_updateOutput<double,4>, "");
m.def("cuda_float_SparseToDense_updateOutput_4", &cuda_SparseToDense_updateOutput<float,4>, "");
m.def("cpu_float_SparseToDense_updateGradInput_1", &cpu_SparseToDense_updateGradInput<float,1>, "");
m.def("cpu_double_SparseToDense_updateGradInput_1", &cpu_SparseToDense_updateGradInput<double,1>, "");
m.def("cuda_float_SparseToDense_updateGradInput_1", &cuda_SparseToDense_updateGradInput<float,1>, "");
m.def("cpu_float_SparseToDense_updateGradInput_2", &cpu_SparseToDense_updateGradInput<float,2>, "");
m.def("cpu_double_SparseToDense_updateGradInput_2", &cpu_SparseToDense_updateGradInput<double,2>, "");
m.def("cuda_float_SparseToDense_updateGradInput_2", &cuda_SparseToDense_updateGradInput<float,2>, "");
m.def("cpu_float_SparseToDense_updateGradInput_3", &cpu_SparseToDense_updateGradInput<float,3>, "");
m.def("cpu_double_SparseToDense_updateGradInput_3", &cpu_SparseToDense_updateGradInput<double,3>, "");
m.def("cuda_float_SparseToDense_updateGradInput_3", &cuda_SparseToDense_updateGradInput<float,3>, "");
m.def("cpu_float_SparseToDense_updateGradInput_4", &cpu_SparseToDense_updateGradInput<float,4>, "");
m.def("cpu_double_SparseToDense_updateGradInput_4", &cpu_SparseToDense_updateGradInput<double,4>, "");
m.def("cuda_float_SparseToDense_updateGradInput_4", &cuda_SparseToDense_updateGradInput<float,4>, "");
// SubmanifoldConvolution: forward and backward.
m.def("cpu_float_SubmanifoldConvolution_updateOutput_1", &cpu_SubmanifoldConvolution_updateOutput<float,1>, "");
m.def("cpu_double_SubmanifoldConvolution_updateOutput_1", &cpu_SubmanifoldConvolution_updateOutput<double,1>, "");
m.def("cuda_float_SubmanifoldConvolution_updateOutput_1", &cuda_SubmanifoldConvolution_updateOutput<float,1>, "");
m.def("cpu_float_SubmanifoldConvolution_updateOutput_2", &cpu_SubmanifoldConvolution_updateOutput<float,2>, "");
m.def("cpu_double_SubmanifoldConvolution_updateOutput_2", &cpu_SubmanifoldConvolution_updateOutput<double,2>, "");
m.def("cuda_float_SubmanifoldConvolution_updateOutput_2", &cuda_SubmanifoldConvolution_updateOutput<float,2>, "");
m.def("cpu_float_SubmanifoldConvolution_updateOutput_3", &cpu_SubmanifoldConvolution_updateOutput<float,3>, "");
m.def("cpu_double_SubmanifoldConvolution_updateOutput_3", &cpu_SubmanifoldConvolution_updateOutput<double,3>, "");
m.def("cuda_float_SubmanifoldConvolution_updateOutput_3", &cuda_SubmanifoldConvolution_updateOutput<float,3>, "");
m.def("cpu_float_SubmanifoldConvolution_updateOutput_4", &cpu_SubmanifoldConvolution_updateOutput<float,4>, "");
m.def("cpu_double_SubmanifoldConvolution_updateOutput_4", &cpu_SubmanifoldConvolution_updateOutput<double,4>, "");
m.def("cuda_float_SubmanifoldConvolution_updateOutput_4", &cuda_SubmanifoldConvolution_updateOutput<float,4>, "");
m.def("cpu_float_SubmanifoldConvolution_backward_1", &cpu_SubmanifoldConvolution_backward<float,1>, "");
m.def("cpu_double_SubmanifoldConvolution_backward_1", &cpu_SubmanifoldConvolution_backward<double,1>, "");
m.def("cuda_float_SubmanifoldConvolution_backward_1", &cuda_SubmanifoldConvolution_backward<float,1>, "");
m.def("cpu_float_SubmanifoldConvolution_backward_2", &cpu_SubmanifoldConvolution_backward<float,2>, "");
m.def("cpu_double_SubmanifoldConvolution_backward_2", &cpu_SubmanifoldConvolution_backward<double,2>, "");
m.def("cuda_float_SubmanifoldConvolution_backward_2", &cuda_SubmanifoldConvolution_backward<float,2>, "");
m.def("cpu_float_SubmanifoldConvolution_backward_3", &cpu_SubmanifoldConvolution_backward<float,3>, "");
m.def("cpu_double_SubmanifoldConvolution_backward_3", &cpu_SubmanifoldConvolution_backward<double,3>, "");
m.def("cuda_float_SubmanifoldConvolution_backward_3", &cuda_SubmanifoldConvolution_backward<float,3>, "");
m.def("cpu_float_SubmanifoldConvolution_backward_4", &cpu_SubmanifoldConvolution_backward<float,4>, "");
m.def("cpu_double_SubmanifoldConvolution_backward_4", &cpu_SubmanifoldConvolution_backward<double,4>, "");
m.def("cuda_float_SubmanifoldConvolution_backward_4", &cuda_SubmanifoldConvolution_backward<float,4>, "");
// InputLayer: forward and input-gradient.
m.def("cpu_float_InputLayer_updateOutput_1", &cpu_InputLayer_updateOutput<float,1>, "");
m.def("cpu_double_InputLayer_updateOutput_1", &cpu_InputLayer_updateOutput<double,1>, "");
m.def("cuda_float_InputLayer_updateOutput_1", &cuda_InputLayer_updateOutput<float,1>, "");
m.def("cpu_float_InputLayer_updateOutput_2", &cpu_InputLayer_updateOutput<float,2>, "");
m.def("cpu_double_InputLayer_updateOutput_2", &cpu_InputLayer_updateOutput<double,2>, "");
m.def("cuda_float_InputLayer_updateOutput_2", &cuda_InputLayer_updateOutput<float,2>, "");
m.def("cpu_float_InputLayer_updateOutput_3", &cpu_InputLayer_updateOutput<float,3>, "");
m.def("cpu_double_InputLayer_updateOutput_3", &cpu_InputLayer_updateOutput<double,3>, "");
m.def("cuda_float_InputLayer_updateOutput_3", &cuda_InputLayer_updateOutput<float,3>, "");
m.def("cpu_float_InputLayer_updateOutput_4", &cpu_InputLayer_updateOutput<float,4>, "");
m.def("cpu_double_InputLayer_updateOutput_4", &cpu_InputLayer_updateOutput<double,4>, "");
m.def("cuda_float_InputLayer_updateOutput_4", &cuda_InputLayer_updateOutput<float,4>, "");
m.def("cpu_float_InputLayer_updateGradInput_1", &cpu_InputLayer_updateGradInput<float,1>, "");
m.def("cpu_double_InputLayer_updateGradInput_1", &cpu_InputLayer_updateGradInput<double,1>, "");
m.def("cuda_float_InputLayer_updateGradInput_1", &cuda_InputLayer_updateGradInput<float,1>, "");
m.def("cpu_float_InputLayer_updateGradInput_2", &cpu_InputLayer_updateGradInput<float,2>, "");
m.def("cpu_double_InputLayer_updateGradInput_2", &cpu_InputLayer_updateGradInput<double,2>, "");
m.def("cuda_float_InputLayer_updateGradInput_2", &cuda_InputLayer_updateGradInput<float,2>, "");
m.def("cpu_float_InputLayer_updateGradInput_3", &cpu_InputLayer_updateGradInput<float,3>, "");
m.def("cpu_double_InputLayer_updateGradInput_3", &cpu_InputLayer_updateGradInput<double,3>, "");
m.def("cuda_float_InputLayer_updateGradInput_3", &cuda_InputLayer_updateGradInput<float,3>, "");
m.def("cpu_float_InputLayer_updateGradInput_4", &cpu_InputLayer_updateGradInput<float,4>, "");
m.def("cpu_double_InputLayer_updateGradInput_4", &cpu_InputLayer_updateGradInput<double,4>, "");
m.def("cuda_float_InputLayer_updateGradInput_4", &cuda_InputLayer_updateGradInput<float,4>, "");
// OutputLayer: forward and input-gradient.
m.def("cpu_float_OutputLayer_updateOutput_1", &cpu_OutputLayer_updateOutput<float,1>, "");
m.def("cpu_double_OutputLayer_updateOutput_1", &cpu_OutputLayer_updateOutput<double,1>, "");
m.def("cuda_float_OutputLayer_updateOutput_1", &cuda_OutputLayer_updateOutput<float,1>, "");
m.def("cpu_float_OutputLayer_updateOutput_2", &cpu_OutputLayer_updateOutput<float,2>, "");
m.def("cpu_double_OutputLayer_updateOutput_2", &cpu_OutputLayer_updateOutput<double,2>, "");
m.def("cuda_float_OutputLayer_updateOutput_2", &cuda_OutputLayer_updateOutput<float,2>, "");
m.def("cpu_float_OutputLayer_updateOutput_3", &cpu_OutputLayer_updateOutput<float,3>, "");
m.def("cpu_double_OutputLayer_updateOutput_3", &cpu_OutputLayer_updateOutput<double,3>, "");
m.def("cuda_float_OutputLayer_updateOutput_3", &cuda_OutputLayer_updateOutput<float,3>, "");
m.def("cpu_float_OutputLayer_updateOutput_4", &cpu_OutputLayer_updateOutput<float,4>, "");
m.def("cpu_double_OutputLayer_updateOutput_4", &cpu_OutputLayer_updateOutput<double,4>, "");
m.def("cuda_float_OutputLayer_updateOutput_4", &cuda_OutputLayer_updateOutput<float,4>, "");
m.def("cpu_float_OutputLayer_updateGradInput_1", &cpu_OutputLayer_updateGradInput<float,1>, "");
m.def("cpu_double_OutputLayer_updateGradInput_1", &cpu_OutputLayer_updateGradInput<double,1>, "");
m.def("cuda_float_OutputLayer_updateGradInput_1", &cuda_OutputLayer_updateGradInput<float,1>, "");
m.def("cpu_float_OutputLayer_updateGradInput_2", &cpu_OutputLayer_updateGradInput<float,2>, "");
m.def("cpu_double_OutputLayer_updateGradInput_2", &cpu_OutputLayer_updateGradInput<double,2>, "");
m.def("cuda_float_OutputLayer_updateGradInput_2", &cuda_OutputLayer_updateGradInput<float,2>, "");
m.def("cpu_float_OutputLayer_updateGradInput_3", &cpu_OutputLayer_updateGradInput<float,3>, "");
m.def("cpu_double_OutputLayer_updateGradInput_3", &cpu_OutputLayer_updateGradInput<double,3>, "");
m.def("cuda_float_OutputLayer_updateGradInput_3", &cuda_OutputLayer_updateGradInput<float,3>, "");
m.def("cpu_float_OutputLayer_updateGradInput_4", &cpu_OutputLayer_updateGradInput<float,4>, "");
m.def("cpu_double_OutputLayer_updateGradInput_4", &cpu_OutputLayer_updateGradInput<double,4>, "");
m.def("cuda_float_OutputLayer_updateGradInput_4", &cuda_OutputLayer_updateGradInput<float,4>, "");
// BLInputLayer: forward and input-gradient.
m.def("cpu_float_BLInputLayer_updateOutput_1", &cpu_BLInputLayer_updateOutput<float,1>, "");
m.def("cpu_double_BLInputLayer_updateOutput_1", &cpu_BLInputLayer_updateOutput<double,1>, "");
m.def("cuda_float_BLInputLayer_updateOutput_1", &cuda_BLInputLayer_updateOutput<float,1>, "");
m.def("cpu_float_BLInputLayer_updateOutput_2", &cpu_BLInputLayer_updateOutput<float,2>, "");
m.def("cpu_double_BLInputLayer_updateOutput_2", &cpu_BLInputLayer_updateOutput<double,2>, "");
m.def("cuda_float_BLInputLayer_updateOutput_2", &cuda_BLInputLayer_updateOutput<float,2>, "");
m.def("cpu_float_BLInputLayer_updateOutput_3", &cpu_BLInputLayer_updateOutput<float,3>, "");
m.def("cpu_double_BLInputLayer_updateOutput_3", &cpu_BLInputLayer_updateOutput<double,3>, "");
m.def("cuda_float_BLInputLayer_updateOutput_3", &cuda_BLInputLayer_updateOutput<float,3>, "");
m.def("cpu_float_BLInputLayer_updateOutput_4", &cpu_BLInputLayer_updateOutput<float,4>, "");
m.def("cpu_double_BLInputLayer_updateOutput_4", &cpu_BLInputLayer_updateOutput<double,4>, "");
m.def("cuda_float_BLInputLayer_updateOutput_4", &cuda_BLInputLayer_updateOutput<float,4>, "");
m.def("cpu_float_BLInputLayer_updateGradInput_1", &cpu_BLInputLayer_updateGradInput<float,1>, "");
m.def("cpu_double_BLInputLayer_updateGradInput_1", &cpu_BLInputLayer_updateGradInput<double,1>, "");
m.def("cuda_float_BLInputLayer_updateGradInput_1", &cuda_BLInputLayer_updateGradInput<float,1>, "");
m.def("cpu_float_BLInputLayer_updateGradInput_2", &cpu_BLInputLayer_updateGradInput<float,2>, "");
m.def("cpu_double_BLInputLayer_updateGradInput_2", &cpu_BLInputLayer_updateGradInput<double,2>, "");
m.def("cuda_float_BLInputLayer_updateGradInput_2", &cuda_BLInputLayer_updateGradInput<float,2>, "");
m.def("cpu_float_BLInputLayer_updateGradInput_3", &cpu_BLInputLayer_updateGradInput<float,3>, "");
m.def("cpu_double_BLInputLayer_updateGradInput_3", &cpu_BLInputLayer_updateGradInput<double,3>, "");
m.def("cuda_float_BLInputLayer_updateGradInput_3", &cuda_BLInputLayer_updateGradInput<float,3>, "");
m.def("cpu_float_BLInputLayer_updateGradInput_4", &cpu_BLInputLayer_updateGradInput<float,4>, "");
m.def("cpu_double_BLInputLayer_updateGradInput_4", &cpu_BLInputLayer_updateGradInput<double,4>, "");
m.def("cuda_float_BLInputLayer_updateGradInput_4", &cuda_BLInputLayer_updateGradInput<float,4>, "");
// BLOutputLayer: forward and input-gradient.
m.def("cpu_float_BLOutputLayer_updateOutput_1", &cpu_BLOutputLayer_updateOutput<float,1>, "");
m.def("cpu_double_BLOutputLayer_updateOutput_1", &cpu_BLOutputLayer_updateOutput<double,1>, "");
m.def("cuda_float_BLOutputLayer_updateOutput_1", &cuda_BLOutputLayer_updateOutput<float,1>, "");
m.def("cpu_float_BLOutputLayer_updateOutput_2", &cpu_BLOutputLayer_updateOutput<float,2>, "");
m.def("cpu_double_BLOutputLayer_updateOutput_2", &cpu_BLOutputLayer_updateOutput<double,2>, "");
m.def("cuda_float_BLOutputLayer_updateOutput_2", &cuda_BLOutputLayer_updateOutput<float,2>, "");
m.def("cpu_float_BLOutputLayer_updateOutput_3", &cpu_BLOutputLayer_updateOutput<float,3>, "");
m.def("cpu_double_BLOutputLayer_updateOutput_3", &cpu_BLOutputLayer_updateOutput<double,3>, "");
m.def("cuda_float_BLOutputLayer_updateOutput_3", &cuda_BLOutputLayer_updateOutput<float,3>, "");
m.def("cpu_float_BLOutputLayer_updateOutput_4", &cpu_BLOutputLayer_updateOutput<float,4>, "");
m.def("cpu_double_BLOutputLayer_updateOutput_4", &cpu_BLOutputLayer_updateOutput<double,4>, "");
m.def("cuda_float_BLOutputLayer_updateOutput_4", &cuda_BLOutputLayer_updateOutput<float,4>, "");
m.def("cpu_float_BLOutputLayer_updateGradInput_1", &cpu_BLOutputLayer_updateGradInput<float,1>, "");
m.def("cpu_double_BLOutputLayer_updateGradInput_1", &cpu_BLOutputLayer_updateGradInput<double,1>, "");
m.def("cuda_float_BLOutputLayer_updateGradInput_1", &cuda_BLOutputLayer_updateGradInput<float,1>, "");
m.def("cpu_float_BLOutputLayer_updateGradInput_2", &cpu_BLOutputLayer_updateGradInput<float,2>, "");
m.def("cpu_double_BLOutputLayer_updateGradInput_2", &cpu_BLOutputLayer_updateGradInput<double,2>, "");
m.def("cuda_float_BLOutputLayer_updateGradInput_2", &cuda_BLOutputLayer_updateGradInput<float,2>, "");
m.def("cpu_float_BLOutputLayer_updateGradInput_3", &cpu_BLOutputLayer_updateGradInput<float,3>, "");
m.def("cpu_double_BLOutputLayer_updateGradInput_3", &cpu_BLOutputLayer_updateGradInput<double,3>, "");
m.def("cuda_float_BLOutputLayer_updateGradInput_3", &cuda_BLOutputLayer_updateGradInput<float,3>, "");
m.def("cpu_float_BLOutputLayer_updateGradInput_4", &cpu_BLOutputLayer_updateGradInput<float,4>, "");
m.def("cpu_double_BLOutputLayer_updateGradInput_4", &cpu_BLOutputLayer_updateGradInput<double,4>, "");
m.def("cuda_float_BLOutputLayer_updateGradInput_4", &cuda_BLOutputLayer_updateGradInput<float,4>, "");
// UnPooling: forward and input-gradient.
m.def("cpu_float_UnPooling_updateOutput_1", &cpu_UnPooling_updateOutput<float,1>, "");
m.def("cpu_double_UnPooling_updateOutput_1", &cpu_UnPooling_updateOutput<double,1>, "");
m.def("cuda_float_UnPooling_updateOutput_1", &cuda_UnPooling_updateOutput<float,1>, "");
m.def("cpu_float_UnPooling_updateOutput_2", &cpu_UnPooling_updateOutput<float,2>, "");
m.def("cpu_double_UnPooling_updateOutput_2", &cpu_UnPooling_updateOutput<double,2>, "");
m.def("cuda_float_UnPooling_updateOutput_2", &cuda_UnPooling_updateOutput<float,2>, "");
m.def("cpu_float_UnPooling_updateOutput_3", &cpu_UnPooling_updateOutput<float,3>, "");
m.def("cpu_double_UnPooling_updateOutput_3", &cpu_UnPooling_updateOutput<double,3>, "");
m.def("cuda_float_UnPooling_updateOutput_3", &cuda_UnPooling_updateOutput<float,3>, "");
m.def("cpu_float_UnPooling_updateOutput_4", &cpu_UnPooling_updateOutput<float,4>, "");
m.def("cpu_double_UnPooling_updateOutput_4", &cpu_UnPooling_updateOutput<double,4>, "");
m.def("cuda_float_UnPooling_updateOutput_4", &cuda_UnPooling_updateOutput<float,4>, "");
m.def("cpu_float_UnPooling_updateGradInput_1", &cpu_UnPooling_updateGradInput<float,1>, "");
m.def("cpu_double_UnPooling_updateGradInput_1", &cpu_UnPooling_updateGradInput<double,1>, "");
m.def("cuda_float_UnPooling_updateGradInput_1", &cuda_UnPooling_updateGradInput<float,1>, "");
m.def("cpu_float_UnPooling_updateGradInput_2", &cpu_UnPooling_updateGradInput<float,2>, "");
m.def("cpu_double_UnPooling_updateGradInput_2", &cpu_UnPooling_updateGradInput<double,2>, "");
m.def("cuda_float_UnPooling_updateGradInput_2", &cuda_UnPooling_updateGradInput<float,2>, "");
m.def("cpu_float_UnPooling_updateGradInput_3", &cpu_UnPooling_updateGradInput<float,3>, "");
m.def("cpu_double_UnPooling_updateGradInput_3", &cpu_UnPooling_updateGradInput<double,3>, "");
m.def("cuda_float_UnPooling_updateGradInput_3", &cuda_UnPooling_updateGradInput<float,3>, "");
m.def("cpu_float_UnPooling_updateGradInput_4", &cpu_UnPooling_updateGradInput<float,4>, "");
m.def("cpu_double_UnPooling_updateGradInput_4", &cpu_UnPooling_updateGradInput<double,4>, "");
m.def("cuda_float_UnPooling_updateGradInput_4", &cuda_UnPooling_updateGradInput<float,4>, "");
// Width of the rulebook index type in bits (8 * sizeof(Int)); lets the
// Python side detect how the extension was compiled.
m.def("n_rulebook_bits", []() {return 8*sizeof(Int);}, "");
}
......@@ -17,7 +17,7 @@ from .dropout import Dropout, BatchwiseDropout
from .fullConvolution import FullConvolution
from .identity import Identity
from .inputBatch import InputBatch
from .ioLayers import InputLayer, OutputLayer, BLInputLayer, BLOutputLayer
from .ioLayers import InputLayer, OutputLayer, BLInputLayer, BLOutputLayer, InputLayerInput
from .maxPooling import MaxPooling
from .metadata import Metadata
from .networkArchitectures import *
......
......@@ -86,7 +86,7 @@ class AveragePoolingFunction(Function):
output_spatial_size,
pool_size,
pool_stride,
input_metadata.ffi,
input_metadata,
input_features,
output_features,
nFeaturesToDrop)
......@@ -114,7 +114,7 @@ class AveragePoolingFunction(Function):
output_spatial_size,
pool_size,
pool_stride,
ctx.input_metadata.ffi,
ctx.input_metadata,
input_features,
grad_input,
grad_output.contiguous(),
......
......@@ -39,9 +39,6 @@ class BatchNormalization(Module):
if affine:
self.weight = Parameter(torch.Tensor(nPlanes).fill_(1))
self.bias = Parameter(torch.Tensor(nPlanes).fill_(0))
else:
self.weight = None
self.bias = None
def forward(self, input):
assert input.features.nelement() == 0 or input.features.size(1) == self.nPlanes
......@@ -50,8 +47,8 @@ class BatchNormalization(Module):
output.spatial_size = input.spatial_size
output.features = BatchNormalizationFunction.apply(
input.features,
self.weight,
self.bias,
optionalTensor(self, 'weight'),
optionalTensor(self, 'bias'),
self.runningMean,
self.runningVar,
self.eps,
......@@ -117,8 +114,8 @@ class BatchNormalizationFunction(Function):
saveInvStd,
runningMean,
runningVar,
weight if weight is not None else nullptr,
bias if bias is not None else nullptr,
weight,
bias,
eps,
momentum,
ctx.train,
......@@ -145,14 +142,8 @@ class BatchNormalizationFunction(Function):
saveInvStd = ctx.saved_tensors
assert ctx.train
grad_input = grad_output.new()
if weight is None:
grad_weight = None
else:
grad_weight = input_features.new().resize_(ctx.nPlanes).zero_()
if bias is None:
grad_bias = None
else:
grad_bias = input_features.new().resize_(ctx.nPlanes).zero_()
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
typed_fn(input_features, 'BatchNormalization_backward')(
input_features,
grad_input,
......@@ -162,80 +153,9 @@ class BatchNormalizationFunction(Function):
saveInvStd,
runningMean,
runningVar,
weight if weight is not None else nullptr,
bias if bias is not None else nullptr,
grad_weight if grad_weight is not None else nullptr,
grad_bias if grad_bias is not None else nullptr,
weight,
bias,
grad_weight,
grad_bias,
ctx.leakiness)
return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
#
#
# class BatchNormalization(Module):
# """
# Parameters:
# nPlanes : number of input planes
# eps : small number used to stabilise standard deviation calculation
# momentum : for calculating running average for testing (default 0.9)
# affine : only 'true' is supported at present (default 'true')
# noise : add multiplicative and additive noise during training if >0.
# leakiness : Apply activation def inplace: 0<=leakiness<=1.
# 0 for ReLU, values in (0,1) for LeakyReLU, 1 for no activation def.
# """
# def __init__(
# self,
# nPlanes,
# eps=1e-4,
# momentum=0.9,
# affine=True,
# leakiness=1):
# Module.__init__(self)
# self.nPlanes = nPlanes
# self.eps = eps
# self.momentum = momentum
# self.affine = affine
# self.leakiness = leakiness
# self.register_buffer("runningMean", torch.Tensor(nPlanes).fill_(0))
# self.register_buffer("runningVar", torch.Tensor(nPlanes).fill_(1))
# if affine:
# self.weight = Parameter(torch.Tensor(nPlanes).fill_(1))
# self.bias = Parameter(torch.Tensor(nPlanes).fill_(0))
# else:
# self.weight = None
# self.bias = None
#
# def forward(self, input):
# output = SparseConvNetTensor()
# output.metadata = input.metadata
# output.spatial_size = input.spatial_size
# if input.features.ndimension() == 0:
# output.features = input.features
# else:
# output.features = input.features - input.features.mean(0, keepdim=True)
# if self.leakiness != 1:
# output.features = torch.nn.functional.leaky_relu(output.features, self.leakiness)
# return output
#
# def input_spatial_size(self, out_size):
# return out_size
#
# def __repr__(self):
# return str(self.bn)
#
# class BatchNormReLU(BatchNormalization):
# def __init__(self, nPlanes, eps=1e-4, momentum=0.9):
# BatchNormalization.__init__(self, nPlanes, eps, momentum, True, 0)
#
# def __repr__(self):
# s = 'BatchNormReLU(' + str(self.nPlanes) + ',eps=' + str(self.eps) + \
# ',momentum=' + str(self.momentum) + ',affine=' + str(self.affine) + ')'
# return s
#
#
# class BatchNormLeakyReLU(BatchNormalization):
# def __init__(self, nPlanes, eps=1e-4, momentum=0.9):
# BatchNormalization.__init__(self, nPlanes, eps, momentum, True, 0.333)
#
# def __repr__(self):
# s = 'BatchNormReLU(' + str(self.nPlanes) + ',eps=' + str(self.eps) + \
# ',momentum=' + str(self.momentum) + ',affine=' + str(self.affine) + ')'
# return s
return grad_input, optionalTensorReturn(grad_weight), optionalTensorReturn(grad_bias), None, None, None, None, None, None
......@@ -53,9 +53,12 @@ def ClassificationTrainValidate(model, dataset, p):
p['momentum'] = 0.9
if 'check_point' not in p:
p['check_point'] = False
if 'use_gpu' not in p:
p['use_gpu'] = torch.cuda.is_available()
if p['use_gpu']:
if 'use_gpu' in p:
p['use_cuda']=p['use_gpu'] #Back compatibility
del p['use_gpu']
if 'use_cuda' not in p:
p['use_cuda'] = torch.cuda.is_available()
if p['use_cuda']:
model.cuda()
if 'test_reps' not in p:
p['test_reps'] = 1
......@@ -81,8 +84,8 @@ def ClassificationTrainValidate(model, dataset, p):
param_group['lr'] = p['initial_lr'] * \
math.exp((1 - epoch) * p['lr_decay'])
start = time.time()
for batch in dataset['train']():
if p['use_gpu']:
for batch in dataset['train']:
if p['use_cuda']:
batch['input'] = batch['input'].cuda()
batch['target'] = batch['target'].cuda()
optimizer.zero_grad()
......@@ -116,8 +119,8 @@ def ClassificationTrainValidate(model, dataset, p):
start = time.time()
if p['test_reps'] == 1:
stats = {}
for batch in dataset['val']():
if p['use_gpu']:
for batch in dataset['val']:
if p['use_cuda']:
batch['input'] = batch['input'].cuda()
batch['target'] = batch['target'].cuda()
output = model(batch['input'])
......@@ -144,7 +147,7 @@ def ClassificationTrainValidate(model, dataset, p):
ta = []
idxs = []
for batch in dataset['val']():
if p['use_gpu']:
if p['use_cuda']:
batch['input'] = batch['input'].cuda()
batch['target'] = batch['target'].cuda()
batch['idx'] = batch['idx'].cuda()
......
......@@ -10,6 +10,61 @@ from torch.nn import Module, Parameter
from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor
class Convolution(Module):
    """Strided sparse convolution, mapping nIn -> nOut feature planes.

    Output spatial size is (in_size - filter_size) / filter_stride + 1;
    forward() asserts that the sizes divide exactly.

    Parameters:
      dimension     : number of spatial dimensions
      nIn, nOut     : input / output feature planes
      filter_size   : int or per-dimension sequence, filter side length(s)
      filter_stride : int or per-dimension sequence, stride(s)
      bias          : if True, add a learnable per-plane bias
    """
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
        Module.__init__(self)
        self.dimension = dimension
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        self.filter_stride = toLongTensor(dimension, filter_stride)
        # He-style initialization: fan-in is nIn * filter_volume.
        std = (2.0 / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut).normal_(
            0,
            std))
        if bias:
            self.bias = Parameter(torch.Tensor(nOut).zero_())

    def forward(self, input):
        assert input.features.nelement() == 0 or input.features.size(1) == self.nIn
        output = SparseConvNetTensor()
        output.metadata = input.metadata
        # '//' keeps the size arithmetic in integer (Long) tensors; plain '/'
        # is true division on modern PyTorch and would yield floats/errors.
        output.spatial_size =\
            (input.spatial_size - self.filter_size) // self.filter_stride + 1
        assert ((output.spatial_size - 1) * self.filter_stride +
                self.filter_size == input.spatial_size).all()
        output.features = ConvolutionFunction.apply(
            input.features,
            self.weight,
            optionalTensor(self, 'bias'),
            input.metadata,
            input.spatial_size,
            output.spatial_size,
            self.dimension,
            self.filter_size,
            self.filter_stride)
        return output

    def __repr__(self):
        # e.g. 'Convolution 3->8 C2/2' when size/stride are uniform,
        # otherwise 'Convolution 3->8 C(3,2)/(2,1)'.
        s = 'Convolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
        if self.filter_size.max().item() == self.filter_size.min().item() and\
                self.filter_stride.max().item() == self.filter_stride.min().item():
            s = s + str(self.filter_size[0].item()) + \
                '/' + str(self.filter_stride[0].item())
        else:
            s = s + '(' + str(self.filter_size[0].item())
            for i in self.filter_size[1:]:
                s = s + ',' + str(i.item())
            s = s + ')/(' + str(self.filter_stride[0].item())
            for i in self.filter_stride[1:]:
                s = s + ',' + str(i.item())
            s = s + ')'
        return s

    def input_spatial_size(self, out_size):
        # Inverse of the forward size formula.
        return (out_size - 1) * self.filter_stride + self.filter_size
class ConvolutionFunction(Function):
@staticmethod
......@@ -42,12 +97,11 @@ class ConvolutionFunction(Function):
output_spatial_size,
filter_size,
filter_stride,
input_metadata.ffi,
input_metadata,
input_features,
output_features,
weight,
bias if bias is not None else nullptr,
0) # remove this parameter!!
bias)
sparseconvnet.forward_pass_hidden_states += output_features.nelement()
return output_features
......@@ -55,84 +109,19 @@ class ConvolutionFunction(Function):
def backward(ctx, grad_output):
input_features, input_spatial_size, weight, bias, output_spatial_size, filter_size, filter_stride = ctx.saved_tensors
grad_input = grad_output.new()
grad_weight = grad_output.new().resize_as_(weight).zero_()
if bias is None:
grad_bias = None
else:
grad_bias = grad_output.new().resize_as_(bias).zero_()
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
dim_typed_fn(
ctx.dimension, input_features, 'Convolution_backward')(
input_spatial_size,
output_spatial_size,
filter_size,
filter_stride,
ctx.input_metadata.ffi,
ctx.input_metadata,
input_features,
grad_input,
grad_output.contiguous(),
weight,
grad_weight,
grad_bias if grad_bias is not None else nullptr,
0, # remove this parameter
)
return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
class Convolution(Module):
def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
Module.__init__(self)
self.dimension = dimension
self.nIn = nIn
self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item()
self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor(
self.filter_volume * nIn, nOut).normal_(
0,
std))
if bias:
self.bias = Parameter(torch.Tensor(nOut).zero_())
else:
self.bias = None
def forward(self, input):
assert input.features.nelement() == 0 or input.features.size(1) == self.nIn
output = SparseConvNetTensor()
output.metadata = input.metadata
output.spatial_size =\
(input.spatial_size - self.filter_size) / self.filter_stride + 1
assert ((output.spatial_size - 1) * self.filter_stride +
self.filter_size == input.spatial_size).all()
output.features = ConvolutionFunction.apply(
input.features,
self.weight,
self.bias,
input.metadata,
input.spatial_size,
output.spatial_size,
self.dimension,
self.filter_size,
self.filter_stride,
)
return output
def __repr__(self):
s = 'Convolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
if self.filter_size.max().item() == self.filter_size.min().item() and\
self.filter_stride.max().item() == self.filter_stride.min().item():
s = s + str(self.filter_size[0].item()) + \
'/' + str(self.filter_stride[0].item())
else:
s = s + '(' + str(self.filter_size[0].item())
for i in self.filter_size[1:]:
s = s + ',' + str(i.item())
s = s + ')/(' + str(self.filter_stride[0].item())
for i in self.filter_stride[1:]:
s = s + ',' + str(i.item())
s = s + ')'
return s
def input_spatial_size(self, out_size):
return (out_size - 1) * self.filter_stride + self.filter_size
grad_bias)
return grad_input, grad_weight, optionalTensorReturn(grad_bias), None, None, None, None, None, None
......@@ -10,6 +10,62 @@ from torch.nn import Module, Parameter
from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor
class Deconvolution(Module):
    """Strided sparse deconvolution (transposed convolution), nIn -> nOut.

    Output spatial size is (in_size - 1) * filter_stride + filter_size,
    the exact inverse of Convolution's size formula.

    Parameters:
      dimension     : number of spatial dimensions
      nIn, nOut     : input / output feature planes
      filter_size   : int or per-dimension sequence, filter side length(s)
      filter_stride : int or per-dimension sequence, stride(s)
      bias          : if True, add a learnable per-plane bias
    """
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
        Module.__init__(self)
        self.dimension = dimension
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        self.filter_stride = toLongTensor(dimension, filter_stride)
        # He-style initialization: fan-in is nIn * filter_volume.
        std = (2.0 / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut).normal_(
            0,
            std))
        if bias:
            self.bias = Parameter(torch.Tensor(nOut).zero_())

    def forward(self, input):
        assert input.features.nelement() == 0 or input.features.size(1) == self.nIn
        output = SparseConvNetTensor()
        output.metadata = input.metadata
        output.spatial_size =\
            (input.spatial_size - 1) * self.filter_stride + self.filter_size
        output.features = DeconvolutionFunction.apply(
            input.features,
            self.weight,
            optionalTensor(self, 'bias'),
            input.metadata,
            input.spatial_size,
            output.spatial_size,
            self.dimension,
            self.filter_size,
            self.filter_stride)
        return output

    def __repr__(self):
        # e.g. 'Deconvolution 4->4 C2/2' for uniform size/stride.
        s = 'Deconvolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
        if self.filter_size.max().item() == self.filter_size.min().item() and\
                self.filter_stride.max().item() == self.filter_stride.min().item():
            s = s + str(self.filter_size[0].item()) + \
                '/' + str(self.filter_stride[0].item())
        else:
            s = s + '(' + str(self.filter_size[0].item())
            for i in self.filter_size[1:]:
                s = s + ',' + str(i.item())
            s = s + ')/(' + str(self.filter_stride[0].item())
            for i in self.filter_stride[1:]:
                s = s + ',' + str(i.item())
            s = s + ')'
        return s

    def input_spatial_size(self, out_size):
        # '//' keeps the arithmetic in integer (Long) tensors; plain '/' is
        # true division on modern PyTorch and would yield a float tensor.
        in_size = (out_size - self.filter_size) // self.filter_stride + 1
        assert ((in_size - 1) * self.filter_stride +
                self.filter_size == out_size).all()
        return in_size
class DeconvolutionFunction(Function):
@staticmethod
......@@ -35,13 +91,11 @@ class DeconvolutionFunction(Function):
output_spatial_size,
filter_size,
filter_stride,
input_metadata.ffi,
input_metadata,
input_features,
output_features,
weight,
bias if bias is not None else nullptr,
0, # remove this parameter!!
)
bias)
sparseconvnet.forward_pass_hidden_states += output_features.nelement()
ctx.save_for_backward(input_features,
output_features,
......@@ -64,85 +118,19 @@ class DeconvolutionFunction(Function):
filter_size,\
filter_stride = ctx.saved_tensors
grad_input = grad_output.new()
grad_weight = grad_output.new().resize_as_(weight).zero_()
if bias is None:
grad_bias = None
else:
grad_bias = grad_output.new().resize_as_(bias).zero_()
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
dim_typed_fn(
ctx.dimension, input_features, 'Deconvolution_backward')(
input_spatial_size,
output_spatial_size,
filter_size,
filter_stride,
ctx.input_metadata.ffi,
ctx.input_metadata,
input_features,
grad_input,
grad_output.contiguous(),
weight,
grad_weight,
grad_bias if grad_bias is not None else nullptr,
0, # remove this parameter
)
return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
class Deconvolution(Module):
def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
Module.__init__(self)
self.dimension = dimension
self.nIn = nIn
self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item()
self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor(
self.filter_volume * nIn, nOut).normal_(
0,
std))
if bias:
self.bias = Parameter(torch.Tensor(nOut).zero_())
else:
self.bias = None
def forward(self, input):
assert input.features.nelement() == 0 or input.features.size(1) == self.nIn
output = SparseConvNetTensor()
output.metadata = input.metadata
output.spatial_size =\
(input.spatial_size - 1) * self.filter_stride + self.filter_size
output.features = DeconvolutionFunction.apply(
input.features,
self.weight,
self.bias,
input.metadata,
input.spatial_size,
output.spatial_size,
self.dimension,
self.filter_size,
self.filter_stride,
)
return output
def __repr__(self):
s = 'Deconvolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
if self.filter_size.max().item() == self.filter_size.min().item() and\
self.filter_stride.max().item() == self.filter_stride.min().item():
s = s + str(self.filter_size[0].item()) + \
'/' + str(self.filter_stride[0].item())
else:
s = s + '(' + str(self.filter_size[0].item())
for i in self.filter_size[1:]:
s = s + ',' + str(i.item())
s = s + ')/(' + str(self.filter_stride[0].item())
for i in self.filter_stride[1:]:
s = s + ',' + str(i.item())
s = s + ')'
return s
def input_spatial_size(self, out_size):
in_size = (out_size - self.filter_size) / self.filter_stride + 1
assert ((in_size - 1) * self.filter_stride +
self.filter_size == out_size).all()
return in_size
grad_bias)
return grad_input, grad_weight, optionalTensorReturn(grad_bias), None, None, None, None, None, None
......@@ -60,7 +60,7 @@ class DenseToSparseFunction(Function):
r = (nz * s.expand_as(nz)).sum(1).view(-1)
output_features = aa.index_select(0, r)
dim_fn(dimension, 'createMetadataForDenseToSparse')(
output_metadata.ffi,
output_metadata,
output_spatial_size,
nz.cpu(),
input.size(0))
......@@ -70,8 +70,6 @@ class DenseToSparseFunction(Function):
@staticmethod
def backward(ctx, grad_output):
output_features, r = ctx.saved_tensors
print(r)
print(grad_output)
grad_input = grad_output.new().resize_(
ctx.aas2).zero_().index_copy_(0, r, grad_output)
grad_input = grad_input.view(ctx.aas).permute(
......
......@@ -11,6 +11,61 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor
from .metadata import Metadata
class FullConvolution(Module):
    """Strided sparse 'full' convolution: like Deconvolution, but creates a
    fresh Metadata object for the (larger) output spatial size instead of
    reusing the input's.

    Parameters:
      dimension     : number of spatial dimensions
      nIn, nOut     : input / output feature planes
      filter_size   : int or per-dimension sequence, filter side length(s)
      filter_stride : int or per-dimension sequence, stride(s)
      bias          : if True, add a learnable per-plane bias
    """
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
        Module.__init__(self)
        self.dimension = dimension
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        self.filter_stride = toLongTensor(dimension, filter_stride)
        # He-style initialization: fan-in is nIn * filter_volume.
        std = (2.0 / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut).normal_(
            0,
            std))
        if bias:
            self.bias = Parameter(torch.Tensor(nOut).zero_())

    def forward(self, input):
        assert input.features.nelement() == 0 or input.features.size(1) == self.nIn
        output = SparseConvNetTensor()
        # The output gets its own Metadata (a new set of active sites).
        output.metadata = Metadata(self.dimension)
        output.spatial_size =\
            (input.spatial_size - 1) * self.filter_stride + self.filter_size
        output.features = FullConvolutionFunction().apply(
            input.features,
            self.weight,
            optionalTensor(self, 'bias'),
            input.metadata,
            output.metadata,
            input.spatial_size,
            output.spatial_size,
            self.dimension,
            self.filter_size,
            self.filter_stride,
        )
        return output

    def __repr__(self):
        # Fixed to match Convolution/Deconvolution: print plain Python ints
        # via .item() instead of 0-dim tensor reprs such as 'tensor(3)'.
        s = 'FullConvolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
        if self.filter_size.max().item() == self.filter_size.min().item() and\
                self.filter_stride.max().item() == self.filter_stride.min().item():
            s = s + str(self.filter_size[0].item()) + \
                '/' + str(self.filter_stride[0].item())
        else:
            s = s + '(' + str(self.filter_size[0].item())
            for i in self.filter_size[1:]:
                s = s + ',' + str(i.item())
            s = s + ')/(' + str(self.filter_stride[0].item())
            for i in self.filter_stride[1:]:
                s = s + ',' + str(i.item())
            s = s + ')'
        return s

    def input_spatial_size(self, out_size):
        # Inverse of the forward size formula.
        return (out_size - 1) * self.filter_stride + self.filter_size
class FullConvolutionFunction(Function):
@staticmethod
def forward(
......@@ -44,93 +99,32 @@ class FullConvolutionFunction(Function):
output_spatial_size,
filter_size,
filter_stride,
input_metadata.ffi,
output_metadata.ffi,
input_metadata,
output_metadata,
input_features,
output_features,
weight,
bias if bias is not None else nullptr,
0) #remove this parameter!!
bias)
sparseconvnet.forward_pass_hidden_states += output_features.nelement()
return output_features
@staticmethod
def backward(ctx, grad_output):
input_features, input_spatial_size, weight, bias, output_spatial_size, filter_size, filter_stride = ctx.saved_tensors
grad_input=grad_output.new()
grad_weight=grad_output.new().resize_as_(weight).zero_()
if bias is None:
grad_bias=None
else:
grad_bias = grad_output.data.new().resize_as_(bias).zero_()
grad_input = grad_output.new()
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
dim_typed_fn(
ctx.dimension, input_features, 'FullConvolution_backward')(
input_spatial_size,
output_spatial_size,
filter_size,
filter_stride,
ctx.input_metadata.ffi,
ctx.output_metadata.ffi,
ctx.input_metadata,
ctx.output_metadata,
input_features,
grad_input,
grad_output.contiguous(),
weight,
grad_weight,
grad_bias if grad_bias is not None else nullptr,
0) #remove this parameter
return grad_input, grad_weight, grad_bias, None, None, None, None, None, None, None
class FullConvolution(Module):
def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
Module.__init__(self)
self.dimension = dimension
self.nIn = nIn
self.nOut = nOut
self.filter_size = toLongTensor(dimension, filter_size)
self.filter_volume = self.filter_size.prod().item()
self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor(
self.filter_volume * nIn, nOut).normal_(
0,
std))
if bias:
self.bias = Parameter(torch.Tensor(nOut).zero_())
else:
self.bias=None
def forward(self, input):
assert input.features.nelement()==0 or input.features.size(1) == self.nIn
output = SparseConvNetTensor()
output.metadata = Metadata(self.dimension)
output.spatial_size =\
(input.spatial_size - 1) * self.filter_stride + self.filter_size
output.features=FullConvolutionFunction().apply(
input.features,
self.weight,
self.bias,
input.metadata,
output.metadata,
input.spatial_size,
output.spatial_size,
self.dimension,
self.filter_size,
self.filter_stride,
)
return output
def __repr__(self):
s = 'FullConvolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
if self.filter_size.max() == self.filter_size.min() and\
self.filter_stride.max() == self.filter_stride.min():
s = s + str(self.filter_size[0]) + '/' + str(self.filter_stride[0])
else:
s = s + '(' + str(self.filter_size[0])
for i in self.filter_size[1:]:
s = s + ',' + str(i)
s = s + ')/(' + str(self.filter_stride[0])
for i in self.filter_stride[1:]:
s = s + ',' + str(i)
s = s + ')'
return s
def input_spatial_size(self, out_size):
return (out_size - 1) * self.filter_stride + self.filter_size
grad_bias)
return grad_input, grad_weight, optionalTensorReturn(grad_bias), None, None, None, None, None, None, None
......@@ -17,21 +17,19 @@ class InputBatch(SparseConvNetTensor):
self.spatial_size = toLongTensor(dimension, spatial_size)
self.features = torch.FloatTensor()
self.metadata = Metadata(dimension)
dim_fn(dimension, 'setInputSpatialSize')(
self.metadata.ffi, self.spatial_size)
self.metadata.setInputSpatialSize(self.spatial_size)
def add_sample(self):
dim_fn(self.dimension, 'batchAddSample')(
self.metadata.ffi)
self.metadata.batchAddSample()
def set_location(self, location, vector, overwrite=False):
assert location.min() >= 0 and (self.spatial_size - location).min() > 0
dim_fn(self.dimension, 'setInputSpatialLocation')(
self.metadata.ffi, self.features, location, vector, overwrite)
self.metadata.setInputSpatialLocation(
self.features, location, vector, overwrite)
def set_location_(self, location, vector, overwrite=False):
dim_fn(self.dimension, 'setInputSpatialLocation')(
self.metadata.ffi, self.features, location, vector, overwrite)
self.metadata.setInputSpatialLocation(
self.features, location, vector, overwrite)
def set_locations(self, locations, vectors, overwrite=False):
"""
......@@ -41,7 +39,7 @@ class InputBatch(SparseConvNetTensor):
- A size (n,d+1) LongTensor; the extra column specifies the sample
number (within the minibatch of samples).
Example with d=3 and n=2:
Example with d==3 and n==2:
Set
locations = LongTensor([[1,2,3],
[4,5,6]])
......@@ -54,18 +52,15 @@ class InputBatch(SparseConvNetTensor):
"""
l = locations[:, :self.dimension]
assert l.min() >= 0 and (self.spatial_size.expand_as(l) - l).min() > 0
dim_fn(self.dimension, 'setInputSpatialLocations')(
self.metadata.ffi, self.features, locations, vectors, overwrite)
self.metadata.setInputSpatialLocations(
self.features, locations, vectors, overwrite)
def set_locations_(self, locations, vector, overwrite=False):
dim_fn(self.dimension, 'setInputSpatialLocations')(
self.metadata.ffi, self.features, locations, vectors, overwrite)
self.metadata.setInputSpatialLocations(
self.features, locations, vectors, overwrite)
def add_sample_from_tensor(self, tensor, offset, threshold=0):
self.nActive = dim_fn(
self.dimension,
'addSampleFromThresholdedTensor')(
self.metadata.ffi,
self.metadata.addSampleFromThresholdedTensor(
self.features,
tensor,
offset,
......@@ -80,39 +75,35 @@ class InputBatch(SparseConvNetTensor):
Use size == 3 if downsizing with size-3 stride-2 operations
"""
if size == 2:
dim_fn(self.dimension, 'generateRuleBooks2s2')(self.metadata.ffi)
self.metadata.generateRuleBooks2s2(self.metadata)
if size == 3 :
dim_fn(self.dimension, 'generateRuleBooks3s2')(self.metadata.ffi)
self.metadata.generateRuleBooks3s2(self.metadata)
"Deprecated method names."
def addSample(self):
dim_fn(self.dimension, 'batchAddSample')(
self.metadata.ffi)
self.metadata.batchAddSample()
def setLocation(self, location, vector, overwrite=False):
assert location.min() >= 0 and (self.spatial_size - location).min() > 0
dim_fn(self.dimension, 'setInputSpatialLocation')(
self.metadata.ffi, self.features, location, vector, overwrite)
self.metadata.setInputSpatialLocation(
self.features, location, vector, overwrite)
def setLocation_(self, location, vector, overwrite=False):
dim_fn(self.dimension, 'setInputSpatialLocation')(
self.metadata.ffi, self.features, location, vector, overwrite)
self.metadata.setInputSpatialLocation(
self.features, location, vector, overwrite)
def setLocations(self, locations, vectors, overwrite=False):
l = locations[:, :self.dimension]
assert l.min() >= 0 and (self.spatial_size.expand_as(l) - l).min() > 0
dim_fn(self.dimension, 'setInputSpatialLocations')(
self.metadata.ffi, self.features, locations, vectors, overwrite)
self.metadata.setInputSpatialLocations(
self.features, locations, vectors, overwrite)
def setLocations_(self, locations, vector, overwrite=False):
dim_fn(self.dimension, 'setInputSpatialLocations')(
self.metadata.ffi, self.features, locations, vectors, overwrite)
self.metadata.setInputSpatialLocations(
self.features, locations, vectors, overwrite)
def addSampleFromTensor(self, tensor, offset, threshold=0):
self.nActive = dim_fn(
self.dimension,
'addSampleFromThresholdedTensor')(
self.metadata.ffi,
self.metadata.addSampleFromThresholdedTensor(
self.features,
tensor,
offset,
......@@ -127,6 +118,6 @@ class InputBatch(SparseConvNetTensor):
Use size == 3 if downsizing with size-3 stride-2 operations
"""
if size == 2:
dim_fn(self.dimension, 'generateRuleBooks2s2')(self.metadata.ffi)
self.metadata.generateRuleBooks2s2()
if size == 3 :
dim_fn(self.dimension, 'generateRuleBooks3s2')(self.metadata.ffi)
self.metadata.generateRuleBooks3s2()
......@@ -163,10 +163,9 @@ class InputLayerFunction(Function):
mode):
output_features = input_features.new()
ctx.dimension = dimension
ctx.metadata = metadata
ctx.dimension = dimension
ctx.metadata_ = metadata
dim_typed_fn(dimension, input_features, 'InputLayer_updateOutput')(
metadata.ffi,
metadata,
spatial_size,
coords,
input_features.contiguous(),
......@@ -183,7 +182,7 @@ class InputLayerFunction(Function):
ctx.dimension,
grad_output,
'InputLayer_updateGradInput')(
ctx.metadata.ffi,
ctx.metadata_,
grad_input,
grad_output.contiguous())
return None, None, None, None, grad_input, None, None
......@@ -197,10 +196,10 @@ class OutputLayerFunction(Function):
metadata,
input_features):
output_features = input_features.new()
ctx.metadata = metadata
ctx.metadata_ = metadata
ctx.dimension = dimension
dim_typed_fn(dimension, input_features, 'OutputLayer_updateOutput')(
metadata.ffi,
metadata,
input_features.contiguous(),
output_features
)
......@@ -214,7 +213,7 @@ class OutputLayerFunction(Function):
ctx.dimension,
grad_output,
'OutputLayer_updateGradInput')(
ctx.metadata.ffi,
ctx.metadata_,
grad_input,
grad_output.contiguous())
return None, None, grad_input
......@@ -231,10 +230,10 @@ class BLInputLayerFunction(Function):
input_features,
mode):
output_features = input_features.new()
ctx.metadata = metadata
ctx.metadata_ = metadata
ctx.dimension = dimension
dim_typed_fn(dimension, input_features, 'BLInputLayer_updateOutput')(
metadata.ffi,
metadata,
spatial_size,
coords,
input_features.contiguous(),
......@@ -250,7 +249,7 @@ class BLInputLayerFunction(Function):
ctx.dimension,
grad_output,
'BLInputLayer_updateGradInput')(
ctx.metadata.ffi,
ctx.metadata_,
grad_input,
grad_output.contiguous())
return None, None, None, None, grad_input, None
......@@ -264,10 +263,10 @@ class BLOutputLayerFunction(Function):
metadata,
input_features):
output_features = input_features.new()
ctx.metadata = metadata
ctx.metadata_ = metadata
ctx.dimension = dimension
dim_typed_fn(dimension, input_features, 'BLOutputLayer_updateOutput')(
metadata.ffi,
metadata,
input_features.contiguous(),
output_features
)
......@@ -280,7 +279,18 @@ class BLOutputLayerFunction(Function):
ctx.dimension,
grad_output,
'BLOutputLayer_updateGradInput')(
ctx.metadata.ffi,
ctx.metadata_,
grad_input,
grad_output.contiguous())
return None, None, grad_input
class InputLayerInput(object):
    """Lightweight (coordinates, features) pair consumed by InputLayer.

    Behaves like a length-2 sequence: item 0 is the coordinate tensor,
    item 1 is the feature tensor.  cuda() moves only the features to the
    GPU (coordinates stay where they are) and returns self for chaining.
    """
    def __init__(self, coords, features):
        # Kept in a list so cuda() can swap the features entry in place.
        self.x = [coords, features]

    def __getitem__(self, index):
        return self.x[index]

    def __len__(self):
        return 2

    def cuda(self):
        self.x[1] = self.x[1].cuda()
        return self
......@@ -31,7 +31,7 @@ class MaxPoolingFunction(Function):
output_spatial_size,
pool_size,
pool_stride,
input_metadata.ffi,
input_metadata,
input_features,
output_features,
nFeaturesToDrop)
......@@ -59,7 +59,7 @@ class MaxPoolingFunction(Function):
output_spatial_size,
pool_size,
pool_stride,
ctx.input_metadata.ffi,
ctx.input_metadata,
input_features,
grad_input,
output_features,
......
......@@ -9,41 +9,9 @@ Store Metadata relating to which spatial locations are active at each scale.
Convolutions, submanifold convolutions and 'convolution reversing' deconvolutions
all coexist within the same Metadata object as long as each spatial size
only occurs once.
Serialization is emulated by storing the pointer as an integer.
This is sufficient for multithreaded batch preparation: each 'serialized'
object must be de-serialized exactly once.
"""
import cffi
from .utils import dim_fn
from .SCN import scn_readPtr, scn_writePtr, scn_3_setInputSpatialSize
ffi = cffi.FFI()
class Metadata(object):
    # Thin Python wrapper around a C-side Metadata pointer, managed via cffi.
    # The pointer is stored in a one-element 'void *[1]' array (self.ffi) so
    # it can be passed by reference into the C functions.
    def __init__(self, dimension, ptr=0):
        # ptr: integer value of an existing C pointer (0 means "empty";
        # the C side allocates on demand).  Used when re-creating the
        # object after pickling (see __reduce__).
        self.dimension = dimension
        self.ffi = ffi.new('void *[1]')
        scn_writePtr(ptr, self.ffi)
        # Attach a GC finalizer so the C-side object is freed when the
        # ffi array is collected.
        self.ffigc = ffi.gc(self.ffi, dim_fn(self.dimension, 'freeMetadata'))
    def set_(self):
        # NOTE(review): frees the C-side metadata while keeping self.ffi
        # alive — presumably a reset; confirm the intended semantics.
        dim_fn(self.dimension, 'freeMetadata')(self.ffi)
        # if hasattr(self, 'ffi'):
        #     del self.ffigc
        #     del self.ffi
    def __reduce__(self):
        # Emulated serialization (see module docstring): the pointer
        # wrappers are dropped and unpickling produces a fresh, empty
        # Metadata of the same dimension.  Each object must therefore be
        # de-serialized exactly once.
        if hasattr(self, 'ffi'):
            del self.ffigc
            del self.ffi
        return (self.__class__, (self.dimension,))
    def __repr__(self):
        # Shows the raw C pointer value when the wrapper is still attached.
        if hasattr(self, 'ffi'):
            return '<<Metadata:dim=' + \
                str(self.dimension) + ', p=' + str(scn_readPtr(self.ffi)) + '>>'
        else:
            return '<<Metadata:dim=' + str(self.dimension) + '>>'
def Metadata(dim):
    # Factory replacing the old cffi wrapper class: returns a new
    # dimension-specific Metadata object from the compiled extension,
    # dispatched through dim_fn.
    return dim_fn(dim,'Metadata')()
......@@ -28,7 +28,7 @@ class NetworkInNetworkFunction(Function):
input_features,
output_features,
weight,
bias if bias is not None else nullptr)
bias)
sparseconvnet.forward_pass_hidden_states += output_features.nelement()
return output_features
......@@ -40,6 +40,7 @@ class NetworkInNetworkFunction(Function):
bias = ctx.saved_tensors
grad_input = grad_output.new()
grad_weight = grad_output.new().resize_as_(weight).zero_()
grad_bias = torch.zeros_like(bias)
if bias is None:
grad_bias = None
else:
......@@ -52,7 +53,7 @@ class NetworkInNetworkFunction(Function):
input_features,
grad_output,
grad_weight,
grad_bias if grad_bias is not None else nullptr)
grad_bias)
return grad_input, grad_weight, grad_bias
......@@ -68,8 +69,6 @@ class NetworkInNetwork(Module):
std))
if bias:
self.bias = Parameter(torch.Tensor(nOut).zero_())
else:
self.bias = None
def forward(self, input):
assert input.features.nelement() == 0 or input.features.size(1) == self.nIn
......@@ -79,7 +78,7 @@ class NetworkInNetwork(Module):
output.features = NetworkInNetworkFunction.apply(
input.features,
self.weight,
self.bias)
optionalTensor(self, 'bias'))
return output
def __repr__(self):
......
......@@ -11,74 +11,6 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor
from .convolution import ConvolutionFunction
class RandomizedStrideConvolutionFunction(Function):
    """autograd Function wrapping the 'RandomizedStrideConvolution'
    C kernels through the ffi-based THNN-style interface.

    forward() stashes everything backward() needs on ctx, then calls the
    dimension/dtype-dispatched updateOutput kernel; backward() calls the
    matching _backward kernel to fill the gradient tensors in place.
    """

    @staticmethod
    def forward(ctx,
                input_features,
                weight,
                bias,
                input_metadata,
                input_spatial_size,
                output_spatial_size,
                dimension,
                filter_size,
                filter_stride):
        # Non-tensor state lives on ctx; tensors go via save_for_backward.
        ctx.dimension = dimension
        ctx.input_metadata = input_metadata
        ctx.save_for_backward(input_features,
                              input_spatial_size,
                              weight,
                              bias,
                              output_spatial_size,
                              filter_size,
                              filter_stride)
        out = input_features.new()
        fwd = dim_typed_fn(dimension, input_features,
                           'RandomizedStrideConvolution_updateOutput')
        # The kernel returns the multiply-add count for the global FLOP tally.
        n_multiply_add = fwd(input_spatial_size,
                             output_spatial_size,
                             filter_size,
                             filter_stride,
                             input_metadata.ffi,
                             input_features,
                             out,
                             weight,
                             nullptr if bias is None else bias,
                             0)  # remove this parameter!!
        sparseconvnet.forward_pass_multiplyAdd_count += n_multiply_add
        sparseconvnet.forward_pass_hidden_states += out.nelement()
        return out

    @staticmethod
    def backward(ctx, grad_output):
        (input_features, input_spatial_size, weight, bias,
         output_spatial_size, filter_size, filter_stride) = ctx.saved_tensors
        grad_input = grad_output.new()
        grad_weight = grad_output.new().resize_as_(weight).zero_()
        # Only allocate a bias gradient when the layer actually has a bias.
        if bias is None:
            grad_bias = None
        else:
            grad_bias = grad_output.new().resize_as_(bias).zero_()
        bwd = dim_typed_fn(ctx.dimension, input_features,
                           'RandomizedStrideConvolution_backward')
        bwd(input_spatial_size,
            output_spatial_size,
            filter_size,
            filter_stride,
            ctx.input_metadata.ffi,
            input_features,
            grad_input,
            grad_output.contiguous(),
            weight,
            grad_weight,
            nullptr if grad_bias is None else grad_bias,
            0,  # remove this parameter
            )
        # One gradient slot per forward() argument; non-tensor args get None.
        return (grad_input, grad_weight, grad_bias,
                None, None, None, None, None, None)
class RandomizedStrideConvolution(Module):
"""
A bit like Fractional Max Pooling during training, but at test time it
......@@ -100,13 +32,11 @@ class RandomizedStrideConvolution(Module):
self.filter_stride = toLongTensor(dimension, filter_stride)
std = (2.0 / nIn / self.filter_volume)**0.5
self.weight = Parameter(torch.Tensor(
self.filter_volume * nIn, nOut).normal_(
self.filter_volume, nIn, nOut).normal_(
0,
std))
if bias:
self.bias = Parameter(torch.Tensor(nOut).zero_())
else:
self.bias = None
def forward(self, input):
assert input.features.ndimension() == 0 or input.features.size(1) == self.nIn
......@@ -121,14 +51,13 @@ class RandomizedStrideConvolution(Module):
#output.features = RandomizedStrideConvolutionFunction.apply(
input.features,
self.weight,
self.bias,
optionalTensor(self, 'bias'),
input.metadata,
input.spatial_size,
output.spatial_size,
self.dimension,
self.filter_size,
self.filter_stride,
)
self.filter_stride)
return output
def __repr__(self):
......@@ -149,3 +78,64 @@ class RandomizedStrideConvolution(Module):
def input_spatial_size(self, out_size):
return (out_size - 1) * self.filter_stride + self.filter_size
class RandomizedStrideConvolutionFunction(Function):
    """autograd Function for RandomizedStrideConvolution (ATen backend).

    Unlike the old ffi interface, bias is always passed as a tensor
    (possibly empty, via optionalTensor); backward() converts an empty
    bias gradient back to None with optionalTensorReturn.
    """

    @staticmethod
    def forward(ctx,
                input_features,
                weight,
                bias,
                input_metadata,
                input_spatial_size,
                output_spatial_size,
                dimension,
                filter_size,
                filter_stride):
        # Non-tensor state lives on ctx; tensors go via save_for_backward.
        ctx.input_metadata = input_metadata
        ctx.dimension = dimension
        ctx.save_for_backward(input_features,
                              input_spatial_size,
                              weight,
                              bias,
                              output_spatial_size,
                              filter_size,
                              filter_stride)
        out = input_features.new()
        fwd = dim_typed_fn(dimension, input_features,
                           'RandomizedStrideConvolution_updateOutput')
        # The kernel returns the multiply-add count for the global FLOP tally.
        sparseconvnet.forward_pass_multiplyAdd_count += fwd(
            input_spatial_size,
            output_spatial_size,
            filter_size,
            filter_stride,
            input_metadata,
            input_features,
            out,
            weight,
            bias)
        sparseconvnet.forward_pass_hidden_states += out.nelement()
        return out

    @staticmethod
    def backward(ctx, grad_output):
        (input_features, input_spatial_size, weight, bias,
         output_spatial_size, filter_size, filter_stride) = ctx.saved_tensors
        grad_input = grad_output.new()
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros_like(bias)  # empty when bias is empty
        bwd = dim_typed_fn(ctx.dimension, input_features,
                           'RandomizedStrideConvolution_backward')
        bwd(input_spatial_size,
            output_spatial_size,
            filter_size,
            filter_stride,
            ctx.input_metadata,
            input_features,
            grad_input,
            grad_output.contiguous(),
            weight,
            grad_weight,
            grad_bias)
        # One gradient slot per forward() argument; non-tensor args get None.
        return (grad_input, grad_weight, optionalTensorReturn(grad_bias),
                None, None, None, None, None, None)
......@@ -31,7 +31,7 @@ class RandomizedStrideMaxPoolingFunction(Function):
output_spatial_size,
pool_size,
pool_stride,
input_metadata.ffi,
input_metadata,
input_features,
output_features,
nFeaturesToDrop)
......@@ -59,7 +59,7 @@ class RandomizedStrideMaxPoolingFunction(Function):
output_spatial_size,
pool_size,
pool_stride,
ctx.input_metadata.ffi,
ctx.input_metadata,
input_features,
grad_input,
output_features,
......
......@@ -5,8 +5,6 @@
# LICENSE file in the root directory of this source tree.
from torch.nn import Sequential as S
from .utils import set
class Sequential(S):
def input_spatial_size(self, out_size):
......
......@@ -20,14 +20,8 @@ class SparseConvNetTensor(object):
"Coordinates and batch index for the active spatial locations"
if spatial_size is None:
spatial_size = self.spatial_size
t = torch.LongTensor()
dim_fn(
self.metadata.dimension,
'getSpatialLocations')(
self.metadata.ffi,
spatial_size,
t)
self.metadata.getSpatialLocations(spatial_size, t)
return t
def type(self, t=None):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment