Use references where possible

d8c8a060 · Michal Pandy · 1171aae3 · d8c8a060 · d8c8a060 · d8c8a060
Commit d8c8a060 authored Aug 29, 2019 by Michal Pandy
20 changed files
--- a/sparseconvnet/SCN/CPU/ActivePooling.cpp
+++ b/sparseconvnet/SCN/CPU/ActivePooling.cpp
@@ -8,12 +8,12 @@
 template <typename T>
 void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
-                               RuleBook &rules, bool average) {
+                               const RuleBook &rules, bool average) {
  Int outSite;
 #pragma omp parallel for private(outSite)
  for (outSite = 0; outSite < batchSize; outSite++) {
    T *out = &output_features[outSite * nPlanes];
-    Int *r = &rules[0][outSite * (maxActive + 1)];
+    const Int *r = &rules[0][outSite * (maxActive + 1)];
    Int nActive = *r++;
    T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
    while (nActive-- > 0) {
@@ -26,12 +26,12 @@ void ActivePooling_ForwardPass(T *input_features, T *output_features,
 template <typename T>
 void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
-                                RuleBook &rules, bool average) {
+                                const RuleBook &rules, bool average) {
  Int outSite;
 #pragma omp parallel for private(outSite)
  for (outSite = 0; outSite < batchSize; outSite++) {
    T *out = &d_output_features[outSite * nPlanes];
-    Int *r = &rules[0][outSite * (maxActive + 1)];
+    const Int *r = &rules[0][outSite * (maxActive + 1)];
    Int nActive = *r++;
    T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
    while (nActive-- > 0) {
@@ -44,12 +44,12 @@ void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
 template <typename T, Int Dimension>
 void cpu_ActivePooling_updateOutput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, bool average) {
+    /*float*/ at::Tensor &output_features, bool average) {
  Int nPlanes = input_features.size(1);
-  auto _rules = m.getActivePoolingRuleBook(inputSize);
+  const auto &_rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  output_features.resize_({batchSize, nPlanes});
@@ -62,13 +62,13 @@ void cpu_ActivePooling_updateOutput(
 template <typename T, Int Dimension>
 void cpu_ActivePooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, bool average) {
+    /*float*/ at::Tensor &d_output_features, bool average) {
  Int nPlanes = input_features.size(1);
-  auto _rules = m.getActivePoolingRuleBook(inputSize);
+  const auto &_rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  d_input_features.resize_as_(input_features);

--- a/sparseconvnet/SCN/CPU/AffineReluTrivialConvolution.cpp
+++ b/sparseconvnet/SCN/CPU/AffineReluTrivialConvolution.cpp
@@ -69,9 +69,9 @@ void AffineReluTrivialConvolution_BackwardPass(
 template <typename T>
 double cpu_AffineReluTrivialConvolution_updateOutput(
-    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &input_features, /*float*/ at::Tensor &output_features,
-    /*float*/ at::Tensor affineWeight,
+    /*float*/ at::Tensor &affineWeight,
-    /*float*/ at::Tensor affineBias, /*float*/ at::Tensor convWeight) {
+    /*float*/ at::Tensor &affineBias, /*float*/ at::Tensor &convWeight) {
  output_features.resize_({input_features.size(0), convWeight.size(1)});
  AffineReluTrivialConvolution_ForwardPass(
      input_features.data<T>(), convWeight.size(0), input_features.stride(0),
@@ -84,11 +84,12 @@ double cpu_AffineReluTrivialConvolution_updateOutput(
 template <typename T>
 void cpu_AffineReluTrivialConvolution_backward(
-    /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor affineWeight,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_affineWeight, /*float*/ at::Tensor affineBias,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &affineWeight,
-    /*float*/ at::Tensor d_affineBias,
+    /*float*/ at::Tensor &d_affineWeight, /*float*/ at::Tensor &affineBias,
-    /*float*/ at::Tensor convWeight, /*float*/ at::Tensor d_convWeight,
+    /*float*/ at::Tensor &d_affineBias,
+    /*float*/ at::Tensor &convWeight, /*float*/ at::Tensor &d_convWeight,
    bool additiveGrad) {
  d_input_features.resize_as_(input_features);

--- a/sparseconvnet/SCN/CPU/AveragePooling.cpp
+++ b/sparseconvnet/SCN/CPU/AveragePooling.cpp
@@ -7,7 +7,7 @@
 template <typename T>
 void AveragePooling_ForwardPass(T *input_features, T *output_features,
                                Int nPlanes, Int input_stride,
-                                Int output_stride, Int *rules, Int nHot,
+                                Int output_stride, const Int *rules, Int nHot,
                                Int filterVolume) {
  Int outSite;
 #pragma omp parallel for private(outSite)
@@ -21,7 +21,7 @@ void AveragePooling_ForwardPass(T *input_features, T *output_features,
 template <typename T>
 void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                 Int nPlanes, Int input_stride,
-                                 Int output_stride, Int *rules, Int nHot,
+                                 Int output_stride, const Int *rules, Int nHot,
                                 Int filterVolume) {
  Int outSite;
 #pragma omp parallel for private(outSite)
@@ -36,14 +36,14 @@ void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
 template <typename T, Int Dimension>
 void cpu_AveragePooling_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
@@ -52,7 +52,7 @@ void cpu_AveragePooling_updateOutput(
  auto iF = input_features.data<T>() + nFeaturesToDrop;
  auto oF = output_features.data<T>();
-  for (auto &r : _rules) {
+  for (const auto &r : _rules) {
    Int nHot = r.size() / 2;
    AveragePooling_ForwardPass<T>(iF, oF, nPlanes, input_features.stride(0),
                                  output_features.stride(0), &r[0], nHot,
@@ -61,15 +61,15 @@ void cpu_AveragePooling_updateOutput(
 }
 template <typename T, Int Dimension>
 void cpu_AveragePooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
@@ -77,7 +77,7 @@ void cpu_AveragePooling_updateGradInput(
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;
  auto doF = d_output_features.data<T>();
-  for (auto &r : _rules) {
+  for (const auto &r : _rules) {
    Int nHot = r.size() / 2;
    AveragePooling_BackwardPass<T>(diF, doF, nPlanes, input_features.stride(0),
                                   d_output_features.stride(0), &r[0], nHot,
@@ -86,8 +86,8 @@ void cpu_AveragePooling_updateGradInput(
 }
 template <typename T>
-void cpu_CopyFeaturesHelper_updateOutput(at::Tensor rules, at::Tensor context,
+void cpu_CopyFeaturesHelper_updateOutput(at::Tensor &rules, at::Tensor &context,
-                                         at::Tensor Context) {
+                                         at::Tensor &Context) {
  Int nHot = rules.size(0) / 2;
  Int nPlanes = context.size(1);
  auto iF = context.data<T>();
@@ -102,9 +102,9 @@ void cpu_CopyFeaturesHelper_updateOutput(at::Tensor rules, at::Tensor context,
  }
 }
 template <typename T>
-void cpu_CopyFeaturesHelper_updateGradInput(at::Tensor rules,
+void cpu_CopyFeaturesHelper_updateGradInput(at::Tensor &rules,
-                                            at::Tensor dcontext,
+                                            at::Tensor &dcontext,
-                                            at::Tensor dContext) {
+                                            at::Tensor &dContext) {
  Int nHot = rules.size(0) / 2;
  Int nPlanes = dcontext.size(1);
  auto iF = dcontext.data<T>();

--- a/sparseconvnet/SCN/CPU/BatchNormalization.cpp
+++ b/sparseconvnet/SCN/CPU/BatchNormalization.cpp
@@ -108,11 +108,11 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
 template <typename T>
 void cpu_BatchNormalization_updateOutput(
-    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &input_features, /*float*/ at::Tensor &output_features,
-    /*float*/ at::Tensor saveMean,
+    /*float*/ at::Tensor &saveMean,
-    /*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
+    /*float*/ at::Tensor &saveInvStd, /*float*/ at::Tensor &runningMean,
-    /*float*/ at::Tensor runningVar,
+    /*float*/ at::Tensor &runningVar,
-    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum,
+    /*float*/ at::Tensor &weight, /*float*/ at::Tensor &bias, T eps, T momentum,
    bool train, T leakiness) {
  output_features.resize_as_(input_features);
  if (input_features.ndimension() == 2) {
@@ -131,13 +131,14 @@ void cpu_BatchNormalization_updateOutput(
 template <typename T>
 void cpu_BatchNormalization_backward(
-    /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor saveMean,
+    /*float*/ at::Tensor &output_features,
-    /*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &saveMean,
-    /*float*/ at::Tensor runningVar,
+    /*float*/ at::Tensor &saveInvStd, /*float*/ at::Tensor &runningMean,
-    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias,
+    /*float*/ at::Tensor &runningVar,
-    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias, T leakiness) {
+    /*float*/ at::Tensor &weight, /*float*/ at::Tensor &bias,
+    /*float*/ at::Tensor &d_weight, /*float*/ at::Tensor &d_bias, T leakiness) {
  d_input_features.resize_as_(input_features);
  if (input_features.ndimension() == 2) {
@@ -155,4 +156,3 @@ void cpu_BatchNormalization_backward(
        leakiness);
  }
 }
--- a/sparseconvnet/SCN/CPU/BatchwiseMultiplicativeDropout.cpp
+++ b/sparseconvnet/SCN/CPU/BatchwiseMultiplicativeDropout.cpp
@@ -6,8 +6,8 @@
 template <typename T>
 void cpu_BatchwiseMultiplicativeDropout_updateOutput(
-    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &input_features, /*float*/ at::Tensor &output_features,
-    /*float*/ at::Tensor noise, T alpha) {
+    /*float*/ at::Tensor &noise, T alpha) {
  output_features.resize_as_(input_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);
@@ -21,8 +21,9 @@ void cpu_BatchwiseMultiplicativeDropout_updateOutput(
 }
 template <typename T>
 void cpu_BatchwiseMultiplicativeDropout_updateGradInput(
-    /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor noise,
+    /*float*/ at::Tensor &d_input_features,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &noise,
    T alpha) {
  d_input_features.resize_as_(d_output_features);
  auto nActive = input_features.size(0);

--- a/sparseconvnet/SCN/CPU/Convolution.cpp
+++ b/sparseconvnet/SCN/CPU/Convolution.cpp
@@ -6,7 +6,7 @@
 // rows x groups x planes -> groups x rows x planes
 template <typename T>
-at::Tensor rule_index_select(at::Tensor src, Int nRules, Int *rules,
+at::Tensor rule_index_select(at::Tensor &src, Int nRules, const Int *rules,
                              Int groups) {
  auto planes = src.size(1) / groups;
  auto target = at::empty({groups, nRules, planes}, src.options());
@@ -26,8 +26,8 @@ at::Tensor rule_index_select(at::Tensor src, Int nRules, Int *rules,
 // groups x rows x planes -> rows x groups x planes
 template <typename T>
-void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules, Int *rules,
+void rule_index_add_(at::Tensor &target, at::Tensor &src, Int nRules,
-                     Int groups) {
+                     const Int *rules, Int groups) {
  auto planes = target.size(1) / groups;
  auto s_ptr = src.data<T>();
  auto t_ptr = target.data<T>();
@@ -44,13 +44,13 @@ void rule_index_add_(at::Tensor target, at::Tensor src, Int nRules, Int *rules,
 template <typename T, Int Dimension>
 double cpu_Convolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor bias) {
+    /*float*/ at::Tensor &bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(1) * weight.size(3)});
@@ -64,7 +64,7 @@ double cpu_Convolution_updateOutput(
  auto ip = weight.size(2);
  auto op = weight.size(3);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op * groups;
@@ -80,15 +80,15 @@ double cpu_Convolution_updateOutput(
 template <typename T, Int Dimension>
 void cpu_Convolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
+    /*float*/ at::Tensor &d_weight, /*float*/ at::Tensor &d_bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
@@ -98,7 +98,7 @@ void cpu_Convolution_backward(
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
@@ -116,13 +116,13 @@ void cpu_Convolution_backward(
 template <typename T, Int Dimension>
 double cpu_SubmanifoldConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &filterSize,
    Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &output_features,
-    /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor bias) {
+    /*float*/ at::Tensor &bias) {
-  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
+  const auto &_rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  output_features.resize_({nActive, weight.size(1) * weight.size(3)});
  if (bias.numel() and nActive)
@@ -135,7 +135,7 @@ double cpu_SubmanifoldConvolution_updateOutput(
  auto ip = weight.size(2);
  auto op = weight.size(3);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op * groups;
@@ -151,15 +151,15 @@ double cpu_SubmanifoldConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cpu_SubmanifoldConvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &filterSize,
    Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor d_weight,
+    /*float*/ at::Tensor &d_weight,
-    /*float*/ at::Tensor d_bias) {
+    /*float*/ at::Tensor &d_bias) {
-  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
+  const auto &_rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
@@ -168,7 +168,7 @@ void cpu_SubmanifoldConvolution_backward(
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
@@ -186,12 +186,12 @@ void cpu_SubmanifoldConvolution_backward(
 template <typename T, Int Dimension>
 double cpu_PermutohedralSubmanifoldConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &output_features,
-    /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor bias) {
+    /*float*/ at::Tensor &bias) {
-  auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
+  const auto &_rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
  Int nActive = m.getNActive(inputSize);
  output_features.resize_({nActive, weight.size(1) * weight.size(3)});
  if (bias.numel() and nActive)
@@ -204,7 +204,7 @@ double cpu_PermutohedralSubmanifoldConvolution_updateOutput(
  auto ip = weight.size(2);
  auto op = weight.size(3);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op * groups;
@@ -220,14 +220,14 @@ double cpu_PermutohedralSubmanifoldConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cpu_PermutohedralSubmanifoldConvolution_backward(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor d_weight,
+    /*float*/ at::Tensor &d_weight,
-    /*float*/ at::Tensor d_bias) {
+    /*float*/ at::Tensor &d_bias) {
-  auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
+  const auto &_rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
@@ -236,7 +236,7 @@ void cpu_PermutohedralSubmanifoldConvolution_backward(
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
@@ -254,15 +254,15 @@ void cpu_PermutohedralSubmanifoldConvolution_backward(
 template <typename T, Int Dimension>
 double cpu_FullConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &output_features,
-    /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor bias) {
+    /*float*/ at::Tensor &bias) {
-  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
+  const auto &_rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(1) * weight.size(3)});
@@ -276,7 +276,7 @@ double cpu_FullConvolution_updateOutput(
  auto ip = weight.size(2);
  auto op = weight.size(3);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op * groups;
@@ -292,17 +292,17 @@ double cpu_FullConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cpu_FullConvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor d_weight,
+    /*float*/ at::Tensor &d_weight,
-    /*float*/ at::Tensor d_bias) {
+    /*float*/ at::Tensor &d_bias) {
-  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
+  const auto &_rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
@@ -312,7 +312,7 @@ void cpu_FullConvolution_backward(
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);
@@ -330,13 +330,13 @@ void cpu_FullConvolution_backward(
 template <typename T, Int Dimension>
 double cpu_RandomizedStrideConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor bias) {
+    /*float*/ at::Tensor &bias) {
-  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
+  const auto &_rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                              filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(1) * weight.size(3)});
@@ -350,7 +350,7 @@ double cpu_RandomizedStrideConvolution_updateOutput(
  auto ip = weight.size(2);
  auto op = weight.size(3);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op * groups;
@@ -366,15 +366,15 @@ double cpu_RandomizedStrideConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cpu_RandomizedStrideConvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
+    /*float*/ at::Tensor &d_weight, /*float*/ at::Tensor &d_bias) {
-  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
+  const auto &_rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                              filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
@@ -384,7 +384,7 @@ void cpu_RandomizedStrideConvolution_backward(
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);

--- a/sparseconvnet/SCN/CPU/Deconvolution.cpp
+++ b/sparseconvnet/SCN/CPU/Deconvolution.cpp
@@ -6,13 +6,13 @@
 template <typename T, Int Dimension>
 double cpu_Deconvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor bias) {
+    /*float*/ at::Tensor &bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(1) * weight.size(3)});
@@ -26,7 +26,7 @@ double cpu_Deconvolution_updateOutput(
  auto ip = weight.size(2);
  auto op = weight.size(3);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      flops += nRules * ip * op * groups;
@@ -42,15 +42,15 @@ double cpu_Deconvolution_updateOutput(
 template <typename T, Int Dimension>
 void cpu_Deconvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor weight,
+    /*float*/ at::Tensor &d_output_features, /*float*/ at::Tensor &weight,
-    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
+    /*float*/ at::Tensor &d_weight, /*float*/ at::Tensor &d_bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
@@ -60,7 +60,7 @@ void cpu_Deconvolution_backward(
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);
  for (Int i = 0; i < (Int)_rules.size(); ++i) {
-    auto r = _rules[i];
+    const auto &r = _rules[i];
    Int nRules = r.size() / 2;
    if (nRules) {
      auto w = weight.select(0, i);

--- a/sparseconvnet/SCN/CPU/IOLayers.cpp
+++ b/sparseconvnet/SCN/CPU/IOLayers.cpp
@@ -48,10 +48,10 @@ void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
 template <typename T, Int Dimension>
 void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
-                                 /*long*/ at::Tensor spatialSize,
+                                 /*long*/ at::Tensor &spatialSize,
-                                 /*long*/ at::Tensor input_coords,
+                                 /*long*/ at::Tensor &input_coords,
-                                 /*float*/ at::Tensor input_features,
+                                 /*float*/ at::Tensor &input_features,
-                                 /*float*/ at::Tensor output_features,
+                                 /*float*/ at::Tensor &output_features,
                                 long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
@@ -72,8 +72,8 @@ void cpu_InputLayer_updateOutput(Metadata<Dimension> &m,
 }
 template <typename T, Int Dimension>
 void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
-                                    /*float*/ at::Tensor d_input_features,
+                                    /*float*/ at::Tensor &d_input_features,
-                                    /*float*/ at::Tensor d_output_features) {
+                                    /*float*/ at::Tensor &d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
@@ -94,8 +94,8 @@ void cpu_InputLayer_updateGradInput(Metadata<Dimension> &m,
 template <typename T, Int Dimension>
 void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
-                                  /*float*/ at::Tensor input_features,
+                                  /*float*/ at::Tensor &input_features,
-                                  /*float*/ at::Tensor output_features) {
+                                  /*float*/ at::Tensor &output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = input_features.size(1);
@@ -115,8 +115,8 @@ void cpu_OutputLayer_updateOutput(Metadata<Dimension> &m,
 }
 template <typename T, Int Dimension>
 void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
-                                     /*float*/ at::Tensor d_input_features,
+                                     /*float*/ at::Tensor &d_input_features,
-                                     /*float*/ at::Tensor d_output_features) {
+                                     /*float*/ at::Tensor &d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
@@ -137,10 +137,10 @@ void cpu_OutputLayer_updateGradInput(Metadata<Dimension> &m,
 template <typename T, Int Dimension>
 void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
-                                   /*long*/ at::Tensor spatialSize,
+                                   /*long*/ at::Tensor &spatialSize,
-                                   /*long*/ at::Tensor input_coords,
+                                   /*long*/ at::Tensor &input_coords,
-                                   /*float*/ at::Tensor input_features,
+                                   /*float*/ at::Tensor &input_features,
-                                   /*float*/ at::Tensor output_features,
+                                   /*float*/ at::Tensor &output_features,
                                   long mode) {
  m.blLayer(spatialSize, input_coords, mode);
@@ -162,8 +162,8 @@ void cpu_BLInputLayer_updateOutput(Metadata<Dimension> &m,
 }
 template <typename T, Int Dimension>
 void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
-                                      /*float*/ at::Tensor d_input_features,
+                                      /*float*/ at::Tensor &d_input_features,
-                                      /*float*/ at::Tensor d_output_features) {
+                                      /*float*/ at::Tensor &d_output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = d_output_features.size(1);
@@ -186,8 +186,8 @@ void cpu_BLInputLayer_updateGradInput(Metadata<Dimension> &m,
 template <typename T, Int Dimension>
 void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
-                                    /*float*/ at::Tensor input_features,
+                                    /*float*/ at::Tensor &input_features,
-                                    /*float*/ at::Tensor output_features) {
+                                    /*float*/ at::Tensor &output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = input_features.size(1);
@@ -207,9 +207,10 @@ void cpu_BLOutputLayer_updateOutput(Metadata<Dimension> &m,
  }
 }
 template <typename T, Int Dimension>
-void cpu_BLOutputLayer_updateGradInput(Metadata<Dimension> &m,
+void cpu_BLOutputLayer_updateGradInput(
-                                       /*float*/ at::Tensor d_input_features,
+    Metadata<Dimension> &m,
-                                       /*float*/ at::Tensor d_output_features) {
+    /*float*/ at::Tensor &d_input_features,
+    /*float*/ at::Tensor &d_output_features) {
  auto &rules = m.blLayerRuleBook;
  auto nPlanes = d_output_features.size(2);

--- a/sparseconvnet/SCN/CPU/LeakyReLU.cpp
+++ b/sparseconvnet/SCN/CPU/LeakyReLU.cpp
@@ -5,8 +5,9 @@
 // LICENSE file in the root directory of this source tree.
 template <typename T>
-void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
+void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor &input_features,
-                                /*float*/ at::Tensor output_features, T alpha) {
+                                /*float*/ at::Tensor &output_features,
+                                T alpha) {
  output_features.resize_as_(input_features);
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
@@ -19,9 +20,9 @@ void cpu_LeakyReLU_updateOutput(/*float*/ at::Tensor input_features,
  }
 }
 template <typename T>
-void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor input_features,
+void cpu_LeakyReLU_updateGradInput(/*float*/ at::Tensor &input_features,
-                                   /*float*/ at::Tensor d_input_features,
+                                   /*float*/ at::Tensor &d_input_features,
-                                   /*float*/ at::Tensor d_output_features,
+                                   /*float*/ at::Tensor &d_output_features,
                                   T alpha) {
  d_input_features.resize_as_(d_output_features);
  auto iF = input_features.data<T>();

--- a/sparseconvnet/SCN/CPU/MaxPooling.cpp
+++ b/sparseconvnet/SCN/CPU/MaxPooling.cpp
@@ -6,7 +6,7 @@
 template <typename T>
 void MaxPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
-                            Int input_stride, Int output_stride, Int *rules,
+                            Int input_stride, Int output_stride, const Int *rules,
                            Int nHot) {
  Int outSite;
 #pragma omp parallel for private(outSite)
@@ -22,7 +22,7 @@ template <typename T>
 void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
                             T *output_features, T *d_output_features,
                             Int nPlanes, Int input_stride, Int output_stride,
-                             Int *rules, Int nHot) {
+                             const Int *rules, Int nHot) {
  Int outSite;
 #pragma omp parallel for private(outSite)
  for (outSite = 0; outSite < nHot; outSite++) {
@@ -36,14 +36,14 @@ void MaxPooling_BackwardPass(T *input_features, T *d_input_features,
 template <typename T, Int Dimension>
 void cpu_MaxPooling_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
@@ -60,15 +60,16 @@ void cpu_MaxPooling_updateOutput(
 }
 template <typename T, Int Dimension>
 void cpu_MaxPooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features, /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &output_features,
+    /*float*/ at::Tensor &d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
@@ -87,14 +88,14 @@ void cpu_MaxPooling_updateGradInput(
 }
 template <typename T, Int Dimension>
 void cpu_RandomizedStrideMaxPooling_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
+  const auto &_rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
@@ -111,15 +112,16 @@ void cpu_RandomizedStrideMaxPooling_updateOutput(
 }
 template <typename T, Int Dimension>
 void cpu_RandomizedStrideMaxPooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features, /*float*/ at::Tensor output_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &output_features,
+    /*float*/ at::Tensor &d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
+  const auto &_rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
                                              poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();

--- a/sparseconvnet/SCN/CPU/NetworkInNetwork.cpp
+++ b/sparseconvnet/SCN/CPU/NetworkInNetwork.cpp
@@ -5,10 +5,10 @@
 // LICENSE file in the root directory of this source tree.
 template <typename T>
-double cpu_NetworkInNetwork_updateOutput(/*float*/ at::Tensor input_features,
+double cpu_NetworkInNetwork_updateOutput(/*float*/ at::Tensor &input_features,
-                                         /*float*/ at::Tensor output_features,
+                                         /*float*/ at::Tensor &output_features,
-                                         /*float*/ at::Tensor weight,
+                                         /*float*/ at::Tensor &weight,
-                                         /*float*/ at::Tensor bias) {
+                                         /*float*/ at::Tensor &bias) {
  auto nActive = input_features.size(0);
  auto input_nPlanes = weight.size(0);
  auto output_nPlanes = weight.size(1);
@@ -23,9 +23,9 @@ double cpu_NetworkInNetwork_updateOutput(/*float*/ at::Tensor input_features,
 }
 template <typename T>
 void cpu_NetworkInNetwork_updateGradInput(
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features,
+    /*float*/ at::Tensor &d_output_features,
-    /*float*/ at::Tensor weight) {
+    /*float*/ at::Tensor &weight) {
  int nActive = d_output_features.size(0);
  d_input_features.resize_({nActive, weight.size(0)});
@@ -35,9 +35,9 @@ void cpu_NetworkInNetwork_updateGradInput(
 }
 template <typename T>
 void cpu_NetworkInNetwork_accGradParameters(
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_output_features,
+    /*float*/ at::Tensor &d_output_features,
-    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias) {
+    /*float*/ at::Tensor &d_weight, /*float*/ at::Tensor &d_bias) {
  auto nActive = input_features.size(0);
  if (nActive and d_bias.numel())
    at::sum_out(d_bias, d_output_features, {0}, false);

--- a/sparseconvnet/SCN/CPU/SparseToDense.cpp
+++ b/sparseconvnet/SCN/CPU/SparseToDense.cpp
@@ -6,7 +6,7 @@
 template <typename T>
 void SparseToDense_ForwardPass(T *input_features, T *output_features,
-                               Int nPlanes, Int spatialVolume, Int *rules,
+                               Int nPlanes, Int spatialVolume, const Int *rules,
                               int nHot) {
  Int outSite;
 #pragma omp parallel for private(outSite)
@@ -20,7 +20,7 @@ void SparseToDense_ForwardPass(T *input_features, T *output_features,
 template <typename T>
 void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
-                                Int nPlanes, Int spatialVolume, Int *rules,
+                                Int nPlanes, Int spatialVolume, const Int *rules,
                                int nHot) {
  Int outSite;
 #pragma omp parallel for private(outSite)
@@ -34,9 +34,9 @@ void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
 template <typename T, Int Dimension>
 void cpu_SparseToDense_updateOutput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, long nPlanes) {
+    /*float*/ at::Tensor &output_features, long nPlanes) {
  {
    std::array<long, Dimension + 2> sz;
@@ -49,7 +49,7 @@ void cpu_SparseToDense_updateOutput(
    output_features.zero_();
  }
  if (input_features.ndimension() == 2) {
-    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
+    const auto &_rules = m.getSparseToDenseRuleBook(inputSize, true);
    Int _nPlanes = input_features.size(1);
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
@@ -64,15 +64,15 @@ void cpu_SparseToDense_updateOutput(
 }
 template <typename T, Int Dimension>
 void cpu_SparseToDense_updateGradInput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features) {
+    /*float*/ at::Tensor &d_output_features) {
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (input_features.ndimension() == 2) {
-    auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
+    const auto &_rules = m.getSparseToDenseRuleBook(inputSize, true);
    long spatialVolume = inputSize.prod().data<long>()[0];
    Int _nPlanes = d_input_features.size(1);
    auto diF = d_input_features.data<T>();

--- a/sparseconvnet/SCN/CPU/UnPooling.cpp
+++ b/sparseconvnet/SCN/CPU/UnPooling.cpp
@@ -6,7 +6,7 @@
 template <typename T>
 void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
-                           Int input_stride, Int output_stride, Int *rules,
+                           Int input_stride, Int output_stride, const Int *rules,
                           Int nHot) {
  Int outSite;
 #pragma omp parallel for private(outSite)
@@ -20,7 +20,7 @@ void UnPooling_ForwardPass(T *input_features, T *output_features, Int nPlanes,
 template <typename T>
 void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
                            Int nPlanes, Int input_stride, Int output_stride,
-                            Int *rules, Int nHot) {
+                            const Int *rules, Int nHot) {
  Int outSite;
 #pragma omp parallel for private(outSite)
  for (outSite = 0; outSite < nHot; outSite++) {
@@ -33,14 +33,14 @@ void UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
 template <typename T, Int Dimension>
 void cpu_UnPooling_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor input_features,
+    /*float*/ at::Tensor &input_features,
-    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
@@ -57,14 +57,14 @@ void cpu_UnPooling_updateOutput(
 }
 template <typename T, Int Dimension>
 void cpu_UnPooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*float*/ at::Tensor d_input_features,
+    /*float*/ at::Tensor &d_input_features,
-    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
+    /*float*/ at::Tensor &d_output_features, long nFeaturesToDrop) {
  Int nPlanes = d_input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
  auto diF = d_input_features.data<T>() + nFeaturesToDrop;

--- a/sparseconvnet/SCN/CUDA/ActivePooling.cpp
+++ b/sparseconvnet/SCN/CUDA/ActivePooling.cpp
@@ -7,21 +7,21 @@
 template <typename T>
 void ActivePooling_ForwardPass(T *input_features, T *output_features,
                               Int batchSize, Int maxActive, Int nPlanes,
-                               Int *rules, bool average);
+                               const Int *rules, bool average);
 template <typename T>
 void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
                                Int batchSize, Int maxActive, Int nPlanes,
-                                Int *rules, bool average);
+                                const Int *rules, bool average);
 template <typename T, Int Dimension>
 void cuda_ActivePooling_updateOutput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, bool average) {
+    /*cuda float*/ at::Tensor &output_features, bool average) {
  Int nPlanes = input_features.size(1);
-  auto _rules = m.getActivePoolingRuleBook(inputSize);
+  const auto &_rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  output_features.resize_({batchSize, nPlanes});
@@ -34,13 +34,13 @@ void cuda_ActivePooling_updateOutput(
 }
 template <typename T, Int Dimension>
 void cuda_ActivePooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features, bool average) {
+    /*cuda float*/ at::Tensor &d_output_features, bool average) {
  Int nPlanes = input_features.size(1);
-  auto _rules = m.getActivePoolingRuleBook(inputSize);
+  const auto &_rules = m.getActivePoolingRuleBook(inputSize);
  Int batchSize = _rules[1][0];
  Int maxActive = _rules[1][1];
  d_input_features.resize_as_(input_features);

--- a/sparseconvnet/SCN/CUDA/ActivePooling.cu
+++ b/sparseconvnet/SCN/CUDA/ActivePooling.cu
@@ -6,10 +6,10 @@
 template <typename T>
 __global__ void ActivePooling_fp(T *input_features, T *output_features,
-				 Int maxActive, Int nPlanes, Int *rules,
+				 Int maxActive, Int nPlanes, const Int *rules,
 				 bool average) {
  T *out = &output_features[blockIdx.x * nPlanes];
-  Int *r = &rules[blockIdx.x * (maxActive + 1)];
+  const Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
  while (nActive-- > 0) {
@@ -21,7 +21,7 @@ __global__ void ActivePooling_fp(T *input_features, T *output_features,
 template <typename T>
 void ActivePooling_ForwardPass(T *input_features, T *output_features,
 			       Int batchSize, Int maxActive, Int nPlanes,
-			       Int *rules, bool average) {
+			       const Int *rules, bool average) {
  auto rulesBuffer = at::empty({1<<22}, at::CUDA(at_kINT));
  Int *rb = rulesBuffer.data<Int>();
@@ -41,10 +41,10 @@ void ActivePooling_ForwardPass(T *input_features, T *output_features,
 }
 template <typename T>
 __global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
-				 Int maxActive, Int nPlanes, Int *rules,
+				 Int maxActive, Int nPlanes, const Int *rules,
 				 bool average) {
  T *out = &d_output_features[blockIdx.x * nPlanes];
-  Int *r = &rules[blockIdx.x * (maxActive + 1)];
+  const Int *r = &rules[blockIdx.x * (maxActive + 1)];
  Int nActive = *r++;
  T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
  while (nActive-- > 0) {
@@ -57,7 +57,7 @@ __global__ void ActivePooling_bp(T *d_input_features, T *d_output_features,
 template <typename T>
 void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
 				Int batchSize, Int maxActive, Int nPlanes,
-				Int *rules, bool average) {
+				const Int *rules, bool average) {
  auto rulesBuffer = at::empty({1<<22}, at::CUDA(at_kINT));
  Int *rb = rulesBuffer.data<Int>();
  Int rowBatchSize = std::min((Int)32768, (1 << 22) / (maxActive + 1));

--- a/sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cpp
+++ b/sparseconvnet/SCN/CUDA/AffineReluTrivialConvolution.cpp
@@ -24,11 +24,11 @@ void dAffineReluTrivialConvolution_backward_dW(
 template <typename T>
 double cuda_AffineReluTrivialConvolution_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features,
+    /*cuda float*/ at::Tensor &output_features,
-    /*cuda float*/ at::Tensor affineWeight,
+    /*cuda float*/ at::Tensor &affineWeight,
-    /*cuda float*/ at::Tensor affineBias,
+    /*cuda float*/ at::Tensor &affineBias,
-    /*cuda float*/ at::Tensor convWeight) {
+    /*cuda float*/ at::Tensor &convWeight) {
  output_features.resize_({input_features.size(0), convWeight.size(1)});
  dAffineReluTrivialConvolution_forward<T>(
@@ -42,15 +42,15 @@ double cuda_AffineReluTrivialConvolution_updateOutput(
 template <typename T>
 void cuda_AffineReluTrivialConvolution_backward(
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor affineWeight,
+    /*cuda float*/ at::Tensor &affineWeight,
-    /*cuda float*/ at::Tensor d_affineWeight,
+    /*cuda float*/ at::Tensor &d_affineWeight,
-    /*cuda float*/ at::Tensor affineBias,
+    /*cuda float*/ at::Tensor &affineBias,
-    /*cuda float*/ at::Tensor d_affineBias,
+    /*cuda float*/ at::Tensor &d_affineBias,
-    /*cuda float*/ at::Tensor convWeight,
+    /*cuda float*/ at::Tensor &convWeight,
-    /*cuda float*/ at::Tensor d_convWeight, bool additiveGrad) {
+    /*cuda float*/ at::Tensor &d_convWeight, bool additiveGrad) {
  d_input_features.resize_as_(input_features);
  dAffineReluTrivialConvolution_backward_dW<T>(

--- a/sparseconvnet/SCN/CUDA/AveragePooling.cpp
+++ b/sparseconvnet/SCN/CUDA/AveragePooling.cpp
@@ -18,14 +18,14 @@ void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
 template <typename T, Int Dimension>
 void cuda_AveragePooling_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
+    /*cuda float*/ at::Tensor &output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
@@ -40,15 +40,15 @@ void cuda_AveragePooling_updateOutput(
 template <typename T, Int Dimension>
 void cuda_AveragePooling_updateGradInput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor poolSize,
+    /*long*/ at::Tensor &poolSize,
-    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &poolStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
+    /*cuda float*/ at::Tensor &d_output_features, long nFeaturesToDrop) {
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
@@ -70,8 +70,8 @@ void cuda_CopyFeaturesHelper_BackwardPass(T *d_input_features,
                                          Int nPlanes, Int nHot);
 template <typename T>
-void cuda_CopyFeaturesHelper_updateOutput(at::Tensor rules, at::Tensor context,
+void cuda_CopyFeaturesHelper_updateOutput(at::Tensor &rules, at::Tensor &context,
-                                          at::Tensor Context) {
+                                          at::Tensor &Context) {
  Int nPlanes = context.size(1);
  Int nHot = rules.size(0) / 2;
@@ -80,9 +80,9 @@ void cuda_CopyFeaturesHelper_updateOutput(at::Tensor rules, at::Tensor context,
 }
 template <typename T>
-void cuda_CopyFeaturesHelper_updateGradInput(at::Tensor rules,
+void cuda_CopyFeaturesHelper_updateGradInput(at::Tensor &rules,
-                                             at::Tensor dcontext,
+                                             at::Tensor &dcontext,
-                                             at::Tensor dContext) {
+                                             at::Tensor &dContext) {
  Int nPlanes = dcontext.size(1);
  Int nHot = rules.size(0) / 2;

--- a/sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cpp
+++ b/sparseconvnet/SCN/CUDA/BatchwiseMultiplicativeDropout.cpp
@@ -13,8 +13,8 @@ void bmd_b(T *input_features, T *d_input_features, T *d_output_features,
 template <typename T>
 void cuda_BatchwiseMultiplicativeDropout_updateOutput(
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
+    /*cuda float*/ at::Tensor &output_features, /*cuda float*/ at::Tensor &noise,
    T alpha) {
  output_features.resize_as_(input_features);
  auto nActive = input_features.size(0);
@@ -25,10 +25,10 @@ void cuda_BatchwiseMultiplicativeDropout_updateOutput(
 template <typename T>
 void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor noise, T alpha) {
+    /*cuda float*/ at::Tensor &noise, T alpha) {
  d_input_features.resize_as_(d_output_features);
  auto nActive = input_features.size(0);
  auto nPlanes = input_features.size(1);

--- a/sparseconvnet/SCN/CUDA/Convolution.cpp
+++ b/sparseconvnet/SCN/CUDA/Convolution.cpp
@@ -22,14 +22,14 @@ void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T, Int Dimension>
 double cuda_Convolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
+    /*cuda float*/ at::Tensor &output_features, /*cuda float*/ at::Tensor &weight,
-    /*cuda float*/ at::Tensor bias) {
+    /*cuda float*/ at::Tensor &bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  Int nGroups = weight.size(1);
@@ -56,16 +56,16 @@ double cuda_Convolution_updateOutput(
 template <typename T, Int Dimension>
 void cuda_Convolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
+    /*cuda float*/ at::Tensor &weight, /*cuda float*/ at::Tensor &d_weight,
-    /*cuda float*/ at::Tensor d_bias) {
+    /*cuda float*/ at::Tensor &d_bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);
@@ -94,13 +94,13 @@ void cuda_Convolution_backward(
 template <typename T, Int Dimension>
 double cuda_SubmanifoldConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &filterSize,
    Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
+    /*cuda float*/ at::Tensor &output_features, /*cuda float*/ at::Tensor &weight,
-    /*cuda float*/ at::Tensor bias) {
+    /*cuda float*/ at::Tensor &bias) {
-  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
+  const auto &_rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  Int nGroups = weight.size(1);
  Int ip = weight.size(2);
@@ -126,15 +126,15 @@ double cuda_SubmanifoldConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cuda_SubmanifoldConvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &filterSize,
    Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
+    /*cuda float*/ at::Tensor &weight, /*cuda float*/ at::Tensor &d_weight,
-    /*cuda float*/ at::Tensor d_bias) {
+    /*cuda float*/ at::Tensor &d_bias) {
-  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
+  const auto &_rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  Int nGroups = weight.size(1);
  Int ip = weight.size(2);
@@ -161,12 +161,12 @@ void cuda_SubmanifoldConvolution_backward(
 template <typename T, Int Dimension>
 double cuda_PermutohedralSubmanifoldConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
+    /*cuda float*/ at::Tensor &output_features, /*cuda float*/ at::Tensor &weight,
-    /*cuda float*/ at::Tensor bias) {
+    /*cuda float*/ at::Tensor &bias) {
-  auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
+  const auto &_rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
  Int nActive = m.getNActive(inputSize);
  Int nGroups = weight.size(1);
  Int ip = weight.size(2);
@@ -192,14 +192,14 @@ double cuda_PermutohedralSubmanifoldConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cuda_PermutohedralSubmanifoldConvolution_backward(
-    /*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &inputSize, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
+    /*cuda float*/ at::Tensor &weight, /*cuda float*/ at::Tensor &d_weight,
-    /*cuda float*/ at::Tensor d_bias) {
+    /*cuda float*/ at::Tensor &d_bias) {
-  auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
+  const auto &_rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
  Int nActive = m.getNActive(inputSize);
  Int nGroups = weight.size(1);
  Int ip = weight.size(2);
@@ -226,15 +226,15 @@ void cuda_PermutohedralSubmanifoldConvolution_backward(
 template <typename T, Int Dimension>
 double cuda_FullConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
+    /*cuda float*/ at::Tensor &output_features, /*cuda float*/ at::Tensor &weight,
-    /*cuda float*/ at::Tensor bias) {
+    /*cuda float*/ at::Tensor &bias) {
-  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
+  const auto &_rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActiveOut = mOut.getNActive(outputSize);
  Int nGroups = weight.size(1);
@@ -261,17 +261,17 @@ double cuda_FullConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cuda_FullConvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
+    /*cuda float*/ at::Tensor &weight, /*cuda float*/ at::Tensor &d_weight,
-    /*cuda float*/ at::Tensor d_bias) {
+    /*cuda float*/ at::Tensor &d_bias) {
-  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
+  const auto &_rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActiveIn = mIn.getNActive(inputSize);
  Int nActiveOut = mOut.getNActive(outputSize);
@@ -299,14 +299,14 @@ void cuda_FullConvolution_backward(
 }
 template <typename T, Int Dimension>
 double cuda_RandomizedStrideConvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features,
+    /*cuda float*/ at::Tensor &output_features,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
+    /*cuda float*/ at::Tensor &weight, /*cuda float*/ at::Tensor &bias) {
-  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
+  const auto &_rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                              filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  Int nGroups = weight.size(1);
@@ -333,16 +333,16 @@ double cuda_RandomizedStrideConvolution_updateOutput(
 template <typename T, Int Dimension>
 void cuda_RandomizedStrideConvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
+    /*cuda float*/ at::Tensor &weight, /*cuda float*/ at::Tensor &d_weight,
-    /*cuda float*/ at::Tensor d_bias) {
+    /*cuda float*/ at::Tensor &d_bias) {
-  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
+  const auto &_rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                              filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);

--- a/sparseconvnet/SCN/CUDA/Deconvolution.cpp
+++ b/sparseconvnet/SCN/CUDA/Deconvolution.cpp
@@ -19,14 +19,14 @@ void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T, Int Dimension>
 double cuda_Deconvolution_updateOutput(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
+    /*cuda float*/ at::Tensor &output_features, /*cuda float*/ at::Tensor &weight,
-    /*cuda float*/ at::Tensor bias) {
+    /*cuda float*/ at::Tensor &bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActiveOut = m.getNActive(outputSize);
  Int nGroups = weight.size(1);
@@ -53,16 +53,16 @@ double cuda_Deconvolution_updateOutput(
 template <typename T, Int Dimension>
 void cuda_Deconvolution_backward(
-    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
+    /*long*/ at::Tensor &inputSize, /*long*/ at::Tensor &outputSize,
-    /*long*/ at::Tensor filterSize,
+    /*long*/ at::Tensor &filterSize,
-    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
+    /*long*/ at::Tensor &filterStride, Metadata<Dimension> &m,
-    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor &input_features,
-    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor &d_input_features,
-    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor &d_output_features,
-    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
+    /*cuda float*/ at::Tensor &weight, /*cuda float*/ at::Tensor &d_weight,
-    /*cuda float*/ at::Tensor d_bias) {
+    /*cuda float*/ at::Tensor &d_bias) {
-  auto _rules =
+  const auto &_rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActiveIn = m.getNActive(inputSize);
  Int nActiveOut = m.getNActive(outputSize);