3d segmantation

1df7b845 · Benjamin Thomas Graham · f2e3800b · 1df7b845 · 1df7b845 · 1df7b845
Commit 1df7b845 authored May 14, 2018 by Benjamin Thomas Graham
20 changed files
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/BatchwiseMultiplicativeDropout.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.cu
@@ -16,7 +16,7 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
-    long filterVolume, THCITensor *rulesBuffer) {
+    long filterVolume) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
@@ -38,9 +38,9 @@ extern "C" double scn_DR_(Convolution_updateOutput)(
      for (uInt i = 0; i < op; i += 32) {
        uInt blockDim = min(32L, op - i);
        uInt gridDim = min(4096, nActive);
-        Convolution_fp_bias
-                << <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
-            (oF + i, b + i, op, op, nActive);
+        Convolution_fp_bias<<<gridDim, blockDim, 0,
+                              THCState_getCurrentStream(state)>>>(
+            oF + i, b + i, op, op, nActive);
      }
    }
    uInt c = ip * op;
@@ -57,7 +57,7 @@ extern "C" void scn_DR_(Convolution_backward)(
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
-    long filterVolume, THCITensor *rulesBuffer) {
+    long filterVolume) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
@@ -90,7 +90,7 @@ extern "C" void scn_DR_(Convolution_backward)(
 extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
-    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
+    THCTensor *bias, long filterVolume) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  uInt nActive = _m.getNActive(inputSize);
@@ -111,9 +111,9 @@ extern "C" double scn_DR_(SubmanifoldConvolution_updateOutput)(
      for (uInt i = 0; i < op; i += 32) {
        uInt blockDim = min(32L, op - i);
        uInt gridDim = min(4096, nActive);
-        Convolution_fp_bias
-                << <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
-            (oF + i, b + i, op, op, nActive);
+        Convolution_fp_bias<<<gridDim, blockDim, 0,
+                              THCState_getCurrentStream(state)>>>(
+            oF + i, b + i, op, op, nActive);
      }
    }
    uInt c = ip * op;
@@ -129,7 +129,7 @@ extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
    THLongTensor *inputSize, THLongTensor *filterSize, void **m,
    THCTensor *input_features, THCTensor *d_input_features,
    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
-    THCTensor *d_bias, long filterVolume, THCITensor *rulesBuffer) {
+    THCTensor *d_bias, long filterVolume) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules = _m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  uInt nActive = _m.getNActive(inputSize);
@@ -158,4 +158,156 @@ extern "C" void scn_DR_(SubmanifoldConvolution_backward)(
  }
 }

+extern "C" double scn_DR_(FullConvolution_updateOutput)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
+    THLongTensor *filterStride, void **mIn, void **mOut,
+    THCTensor *input_features, THCTensor *output_features, THCTensor *weight,
+    THCTensor *bias, long filterVolume, THCITensor *rulesBuffer) {
+
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
+  auto _rules = _mIn.getFullConvolutionRuleBook(
+      inputSize, outputSize, filterSize, filterStride, _mOut);
+  uInt nActive = _mOut.getNActive(outputSize);
+  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
+  if (not bias)
+    THCTensor_(zero)(state, output_features);
+  double flops = 0;
+
+  if (nActive) {
+    auto iF = THCTensor_(data)(state, input_features);
+    auto oF = THCTensor_(data)(state, output_features);
+    auto ip = input_features->size[1];
+    auto op = output_features->size[1];
+    auto w = THCTensor_(data)(state, weight);
+
+    if (bias) {
+      auto b = THCTensor_(data)(state, bias);
+      for (uInt i = 0; i < op; i += 32) {
+        uInt blockDim = min(32L, op - i);
+        uInt gridDim = min(4096, nActive);
+        Convolution_fp_bias<<<gridDim, blockDim, 0,
+                              THCState_getCurrentStream(state)>>>(
+            oF + i, b + i, op, op, nActive);
+      }
+    }
+    uInt c = ip * op;
+    RULEBOOKITERATOR(
+        dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
+                                    THCState_getCurrentStream(state));
+        , w += c; flops += nHotB * c;)
+  }
+  return flops;
+}
+
+extern "C" void scn_DR_(FullConvolution_backward)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
+    THLongTensor *filterStride, void **mIn, void **mOut,
+    THCTensor *input_features, THCTensor *d_input_features,
+    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
+    THCTensor *d_bias, long filterVolume) {
+
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mIn)
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, mOut)
+  auto _rules = _mIn.getFullConvolutionRuleBook(
+      inputSize, outputSize, filterSize, filterStride, _mOut);
+  uInt nActive = _mOut.getNActive(outputSize);
+  THCTensor_(resizeAs)(state, d_input_features, input_features);
+  THCTensor_(zero)(state, d_input_features);
+  if (nActive) {
+    auto iF = THCTensor_(data)(state, input_features);
+    auto diF = THCTensor_(data)(state, d_input_features);
+    auto doF = THCTensor_(data)(state, d_output_features);
+    auto ip = input_features->size[1];
+    auto op = d_output_features->size[1];
+    auto w = THCTensor_(data)(state, weight);
+    auto dw = THCTensor_(data)(state, d_weight);
+    uInt c = ip * op;
+    RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
+                         iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
+                         THCState_getCurrentStream(state));
+                     , w += c; dw += c;)
+
+    if (d_bias) {
+      auto db = THCTensor_(data)(state, d_bias);
+      Convolution_bp_bias(doF, db, op, op, nActive,
+                          THCState_getCurrentStream(state));
+    }
+  }
+}
+extern "C" double scn_DR_(RandomizedStrideConvolution_updateOutput)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
+    THLongTensor *filterStride,
+    void **m, THCTensor *input_features, THCTensor *output_features,
+    THCTensor *weight, THCTensor *bias, long filterVolume,
+    THCITensor *rulesBuffer) {
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  auto _rules =
+      _m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
+  uInt nActive = _m.getNActive(outputSize);
+  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
+  if (not bias)
+    THCTensor_(zero)(state, output_features);
+
+  double flops = 0;
+  if (nActive) {
+    auto iF = THCTensor_(data)(state, input_features);
+    auto oF = THCTensor_(data)(state, output_features);
+    auto ip = input_features->size[1];
+    auto op = output_features->size[1];
+    auto w = THCTensor_(data)(state, weight);
+
+    if (bias) {
+      auto b = THCTensor_(data)(state, bias);
+      for (uInt i = 0; i < op; i += 32) {
+        uInt blockDim = min(32L, op - i);
+        uInt gridDim = min(4096, nActive);
+        Convolution_fp_bias<<<gridDim, blockDim, 0,
+                              THCState_getCurrentStream(state)>>>(
+            oF + i, b + i, op, op, nActive);
+      }
+    }
+    uInt c = ip * op;
+    RULEBOOKITERATOR(
+        dConvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
+                                    THCState_getCurrentStream(state));
+        , w += c; flops += nHotB * c;)
+  }
+  return flops;
+}
+
+extern "C" void scn_DR_(RandomizedStrideConvolution_backward)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
+    THLongTensor *filterStride,
+    void **m, THCTensor *input_features, THCTensor *d_input_features,
+    THCTensor *d_output_features, THCTensor *weight, THCTensor *d_weight,
+    THCTensor *d_bias, long filterVolume) {
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  auto _rules =
+      _m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize, filterStride, true);
+  uInt nActive = _m.getNActive(outputSize);
+  THCTensor_(resizeAs)(state, d_input_features, input_features);
+  THCTensor_(zero)(state, d_input_features);
+
+  if (nActive) {
+    auto iF = THCTensor_(data)(state, input_features);
+    auto diF = THCTensor_(data)(state, d_input_features);
+    auto doF = THCTensor_(data)(state, d_output_features);
+    auto ip = input_features->size[1];
+    auto op = d_output_features->size[1];
+    auto w = THCTensor_(data)(state, weight);
+    auto dw = THCTensor_(data)(state, d_weight);
+    uInt c = ip * op;
+    RULEBOOKITERATOR(dConvolution_backward_dW2<real>(
+                         iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
+                         THCState_getCurrentStream(state));
+                     , w += c; dw += c;)
+
+    if (d_bias) {
+      auto db = THCTensor_(data)(state, d_bias);
+      Convolution_bp_bias(doF, db, op, op, nActive,
+                          THCState_getCurrentStream(state));
+    }
+  }
+}
 #endif
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/Convolution.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
@@ -16,7 +16,7 @@ extern "C" double scn_DR_(Deconvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
-    long filterVolume, THCITensor *rulesBuffer) {
+    long filterVolume) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
@@ -55,7 +55,7 @@ extern "C" void scn_DR_(Deconvolution_backward)(
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
-    long filterVolume, THCITensor *rulesBuffer) {
+    long filterVolume) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);

--- a/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/InputLayer.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/InputLayer.cu
@@ -5,61 +5,128 @@
 // LICENSE file in the root directory of this source tree.

 #ifndef TH_GENERIC_FILE_
-#define TH_GENERIC_FILE_ "generic/GPU/InputLayer.cu"
+#define TH_GENERIC_FILE_ "generic/GPU/IOLayers.cu"
 #else
-#include "InputLayer.h"
+#include "IOLayers.h"

 extern "C" void scn_DR_(InputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
    THCTensor *input_features, THCTensor *output_features, long batchSize,
-    long mode, THCITensor *rulesBuffer) {
+    long mode) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.inputLayer(spatialSize, input_coords, batchSize, mode);
  uInt nPlanes = input_features->size[1];
-  THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
-  THCTensor_(zero)(state, output_features);
  auto &rules = _m.inputLayerRuleBook;
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][3];
-
-    THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+  if (mode == 0) {
+    THCTensor_(resizeAs)(state, output_features, input_features);
+    THCTensor_(copy)(state, output_features, input_features);
+  } else {
+    THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
+    THCTensor_(zero)(state, output_features);
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto iF = THCTensor_(data)(state, input_features);
    auto oF = THCTensor_(data)(state, output_features);
-    auto rb = (uInt*) THCITensor_data(state, rulesBuffer);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
+    THCITensor_(free)(state, rulesBuffer);
+  }
 }
 extern "C" void
 scn_DR_(InputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
-                                    THCTensor *d_output_features,
-                                    THCITensor *rulesBuffer) {
+                                    THCTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.inputLayerRuleBook;
  uInt nPlanes = d_output_features->size[1];
-  THCTensor_(resize2d)(state, d_input_features, rules[0][2], nPlanes);
-  THCTensor_(zero)(state, d_input_features);
  auto mode = rules[0][0];
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][3];
-
-  THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+  if (mode == 0) {
+    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
+    THCTensor_(copy)(state, d_input_features, d_output_features);
+  } else {
+    THCTensor_(resize2d)(state, d_input_features, rules[0][2], nPlanes);
+    THCTensor_(zero)(state, d_input_features);
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto diF = THCTensor_(data)(state, d_input_features);
    auto doF = THCTensor_(data)(state, d_output_features);
-  auto rb = (uInt*)THCITensor_data(state, rulesBuffer);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
+    THCITensor_(free)(state, rulesBuffer);
+  }
+}
+
+extern "C" void scn_DR_(OutputLayer_updateOutput)(void **m,
+                                                  THCTensor *input_features,
+                                                  THCTensor *output_features) {
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  auto &rules = _m.inputLayerRuleBook;
+  uInt nPlanes = input_features->size[1];
+  auto mode = rules[0][0];
+  auto maxActive = rules[0][1];
+  auto nRows = rules[0][3];
+  if (mode == 0) {
+    THCTensor_(resizeAs)(state, output_features, input_features);
+    THCTensor_(copy)(state, output_features, input_features);
+  } else {
+    THCTensor_(resize2d)(state, output_features, rules[0][2], nPlanes);
+    THCTensor_(zero)(state, output_features);
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+    auto iF = THCTensor_(data)(state, input_features);
+    auto oF = THCTensor_(data)(state, output_features);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
+    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
+               cudaMemcpyHostToDevice);
+    InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
+                          THCState_getCurrentStream(state)>>>(
+        oF, iF, nRows, maxActive, nPlanes, rb, false);
+    THCITensor_(free)(state, rulesBuffer);
+  }
+}
+extern "C" void
+scn_DR_(OutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
+                                     THCTensor *d_output_features) {
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  auto &rules = _m.inputLayerRuleBook;
+  uInt nPlanes = d_output_features->size[1];
+  auto mode = rules[0][0];
+  auto maxActive = rules[0][1];
+  auto nRows = rules[0][3];
+  if (mode == 0) {
+    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
+    THCTensor_(copy)(state, d_input_features, d_output_features);
+  } else {
+    THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
+    THCTensor_(zero)(state, d_input_features);
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+    auto diF = THCTensor_(data)(state, d_input_features);
+    auto doF = THCTensor_(data)(state, d_output_features);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
+    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
+               cudaMemcpyHostToDevice);
+    InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
+                          THCState_getCurrentStream(state)>>>(
+        doF, diF, nRows, maxActive, nPlanes, rb, false);
+    THCITensor_(free)(state, rulesBuffer);
+  }
 }

 extern "C" void scn_DR_(BLInputLayer_updateOutput)(
    void **m, THLongTensor *spatialSize, THLongTensor *input_coords,
-    THCTensor *input_features, THCTensor *output_features, long mode,
-    THCITensor *rulesBuffer) {
+    THCTensor *input_features, THCTensor *output_features, long mode) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.blLayer(spatialSize, input_coords, mode);
  uInt nPlanes = input_features->size[2];
@@ -74,21 +141,22 @@ extern "C" void scn_DR_(BLInputLayer_updateOutput)(
    THCTensor_(copy)(state, output_features, input_features);
    THCTensor_(resize2d)(state, output_features, *_m.inputNActive, nPlanes);
  } else {
-  THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto iF = THCTensor_(data)(state, input_features);
    auto oF = THCTensor_(data)(state, output_features);
-  auto rb = (uInt*) THCITensor_data(state, rulesBuffer);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
+    THCITensor_(free)(state, rulesBuffer);
  }
 }
 extern "C" void
 scn_DR_(BLInputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
-                                    THCTensor *d_output_features,
-                                    THCITensor *rulesBuffer) {
+                                      THCTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  uInt nPlanes = d_output_features->size[1];
@@ -99,76 +167,84 @@ scn_DR_(BLInputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
  if (mode == 0) {
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
    THCTensor_(copy)(state, d_input_features, d_output_features);
-    THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3], nPlanes);
+    THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
+                         nPlanes);
  } else {
-    THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3], nPlanes);
+    THCTensor_(resize3d)(state, d_input_features, rules[0][2], rules[0][3],
+                         nPlanes);
    THCTensor_(zero)(state, d_input_features);
-  THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto diF = THCTensor_(data)(state, d_input_features);
    auto doF = THCTensor_(data)(state, d_output_features);
-  auto rb = (uInt*)THCITensor_data(state, rulesBuffer);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
+  THCITensor_(free)(state, rulesBuffer);
  }
 }

-extern "C" void scn_DR_(BLOutputLayer_updateOutput)(
-    void **m,
-    THCTensor *input_features, THCTensor *output_features,
-    THCITensor *rulesBuffer) {
+extern "C" void scn_DR_(BLOutputLayer_updateOutput)(void **m,
+                                                    THCTensor *input_features,
+                                                    THCTensor *output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  uInt nPlanes = input_features->size[1];
  auto mode = rules[0][0];
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][4];
-  if (mode==0) {
+  if (mode == 0) {
    THCTensor_(resizeAs)(state, output_features, input_features);
    THCTensor_(copy)(state, output_features, input_features);
-    THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3], nPlanes);
+    THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
+                         nPlanes);
  } else {
-    THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3], nPlanes);
+    THCTensor_(resize3d)(state, output_features, rules[0][2], rules[0][3],
+                         nPlanes);
    THCTensor_(zero)(state, output_features);
-    THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto iF = THCTensor_(data)(state, input_features);
    auto oF = THCTensor_(data)(state, output_features);
-    auto rb = (uInt*) THCITensor_data(state, rulesBuffer);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        oF, iF, nRows, maxActive, nPlanes, rb, false);
+  THCITensor_(free)(state, rulesBuffer);
  }
 }
 extern "C" void
 scn_DR_(BLOutputLayer_updateGradInput)(void **m, THCTensor *d_input_features,
-                                    THCTensor *d_output_features,
-                                    THCITensor *rulesBuffer) {
+                                       THCTensor *d_output_features) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &rules = _m.blLayerRuleBook;
  uInt nPlanes = d_output_features->size[2];
  uInt mode = rules[0][0];
  uInt maxActive = rules[0][1];
  uInt nRows = rules[0][4];
-  if (mode==0) {
+  if (mode == 0) {
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
    THCTensor_(copy)(state, d_input_features, d_output_features);
    THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
  } else {
    THCTensor_(resize2d)(state, d_input_features, nRows, nPlanes);
    THCTensor_(zero)(state, d_input_features);
-    THCITensor_resize1d(state, rulesBuffer, sizeof(uInt) * rules[1].size());
+    auto rulesBuffer = THCITensor_(new)(state);
+    THCITensor_(resize1d)(state, rulesBuffer, sizeof(uInt) * rules[1].size());
    auto diF = THCTensor_(data)(state, d_input_features);
    auto doF = THCTensor_(data)(state, d_output_features);
-    auto rb = (uInt*)THCITensor_data(state, rulesBuffer);
+    auto rb = (uInt *)THCITensor_(data)(state, rulesBuffer);
    cudaMemcpy(rb, &rules[1][0], sizeof(uInt) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<real><<<std::min(nRows, 32768U), std::min(nPlanes, 32U), 0,
                          THCState_getCurrentStream(state)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
+   THCITensor_(free)(state, rulesBuffer);
  }
 }
 #endif
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/InputLayer.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/InputLayer.h
@@ -4,8 +4,8 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.

-#ifndef GPU_INPUTLAYER_H
-#define GPU_INPUTLAYER_H
+#ifndef GPU_IOLAYERS_H
+#define GPU_IOLAYERS_H

 template <typename T>
 __global__ void InputLayer_fp(T *input_features, T *output_features,
@@ -40,4 +40,4 @@ __global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
    }
  }
 }
-#endif /* GPU_INPUTLAYER_H */
+#endif /* GPU_IOLAYERS_H */
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
@@ -13,7 +13,7 @@
 extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
-    THCTensor *output_features, long nFeaturesToDrop, THCITensor *rulesBuffer) {
+    THCTensor *output_features, long nFeaturesToDrop) {

  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
@@ -35,8 +35,7 @@ extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
-    THCTensor *d_output_features, long nFeaturesToDrop,
-    THCITensor *rulesBuffer) {
+    THCTensor *d_output_features, long nFeaturesToDrop) {

  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
@@ -56,4 +55,49 @@ extern "C" void scn_DR_(MaxPooling_updateGradInput)(
                                    d_output_features->size[1], rbB, nHotB);
      , )
 }
+extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateOutput)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
+    THLongTensor *poolStride, void **m, THCTensor *input_features,
+    THCTensor *output_features, long nFeaturesToDrop) {
+
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
+  auto _rules =
+      _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
+  uInt nActive = _m.getNActive(outputSize);
+  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
+  THCTensor_(zero)(state, output_features);
+
+  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
+  auto oF = THCTensor_(data)(state, output_features);
+  RULEBOOKITERATOR(
+      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
+                                   nPlanes, input_features->size[1],
+                                   output_features->size[1], rbB, nHotB);
+      , )
+}
+extern "C" void scn_DR_(RandomizedStrideMaxPooling_updateGradInput)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
+    THLongTensor *poolStride, void **m, THCTensor *input_features,
+    THCTensor *d_input_features, THCTensor *output_features,
+    THCTensor *d_output_features, long nFeaturesToDrop) {
+
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
+  auto _rules =
+      _m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize, poolStride, true);
+  uInt nActive = _m.getNActive(outputSize);
+  THCTensor_(resizeAs)(state, d_input_features, input_features);
+  THCTensor_(zero)(state, d_input_features);
+
+  auto iF = THCTensor_(data)(state, input_features);
+  auto oF = THCTensor_(data)(state, output_features);
+  auto diF = THCTensor_(data)(state, d_input_features);
+  auto doF = THCTensor_(data)(state, d_output_features);
+  RULEBOOKITERATOR(
+      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), iF, diF,
+                                    oF, doF, nPlanes, input_features->size[1],
+                                    d_output_features->size[1], rbB, nHotB);
+      , )
+}
 #endif
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
@@ -14,9 +14,10 @@

 #define RULEBOOKITERATOR(X, Y)                                                 \
  uInt ms = ruleBookMaxSize(_rules);                                           \
-  if (THCITensor_nElement(state, rulesBuffer) < ms)                            \
-    THCITensor_resize1d(state, rulesBuffer, ms);                               \
-  uInt *rbB = (uInt *)THCITensor_data(state, rulesBuffer);                     \
+  auto rulesBuffer = THCITensor_(new)(state);                                 \
+  if (THCITensor_(nElement)(state, rulesBuffer) < ms)                         \
+    THCITensor_(resize1d)(state, rulesBuffer, ms);                            \
+  uInt *rbB = (uInt *)THCITensor_(data)(state, rulesBuffer);                  \
  for (int k = 0; k < _rules.size(); ++k) {                                    \
    auto &r = _rules[k];                                                       \
    uInt nHotB = r.size() / 2;                                                 \
@@ -28,6 +29,7 @@
      X                                                                        \
    }                                                                          \
    Y                                                                          \
-  }
+  }                                                                            \
+  THCITensor_(free)(state, rulesBuffer);

 #endif /* GPU_RULEBOOKITERATOR_H */
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
@@ -11,7 +11,7 @@

 extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
-    THCTensor *output_features, THCITensor *rulesBuffer, long nPlanes) {
+    THCTensor *output_features, long nPlanes) {

  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
@@ -36,8 +36,7 @@ extern "C" void scn_DR_(SparseToDense_updateOutput)(
 }
 extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
-    THCTensor *d_input_features, THCTensor *d_output_features,
-    THCITensor *rulesBuffer) {
+    THCTensor *d_input_features, THCTensor *d_output_features) {

  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  THCTensor_(resizeAs)(state, d_input_features, input_features);

--- a/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
--- a/sparseconvnet/SCN/generic/GPU/UnPooling.cu
+++ b/sparseconvnet/SCN/generic/GPU/UnPooling.cu
+// Copyright 2016-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#ifndef TH_GENERIC_FILE_
+#define TH_GENERIC_FILE_ "generic/GPU/UnPooling.cu"
+#else
+#include "RuleBookIterator.h"
+#include "UnPooling.h"
+
+extern "C" void scn_DR_(UnPooling_updateOutput)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
+    THLongTensor *poolStride, void **m, THCTensor *input_features,
+    THCTensor *output_features, long nFeaturesToDrop) {
+
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
+  auto _rules =
+      _m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
+  uInt nActive = _m.getNActive(outputSize);
+  THCTensor_(resize2d)(state, output_features, nActive,
+                       input_features->size[1] - nFeaturesToDrop);
+  THCTensor_(zero)(state, output_features);
+
+  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
+  auto oF = THCTensor_(data)(state, output_features);
+  RULEBOOKITERATOR(UnPooling_ForwardPass<real>(
+                       THCState_getCurrentStream(state), iF, oF, nPlanes,
+                       input_features->size[1], output_features->size[1], rbB,
+                       nHotB, _rules.size());
+                   , )
+}
+
+extern "C" void scn_DR_(UnPooling_updateGradInput)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
+    THLongTensor *poolStride, void **m, THCTensor *input_features,
+    THCTensor *d_input_features, THCTensor *d_output_features,
+    long nFeaturesToDrop) {
+
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
+  auto _rules =
+      _m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
+  uInt nActive = _m.getNActive(outputSize);
+  THCTensor_(resizeAs)(state, d_input_features, input_features);
+  THCTensor_(zero)(state, d_input_features);
+
+  auto diF = THCTensor_(data)(state, d_input_features) + nFeaturesToDrop;
+  auto doF = THCTensor_(data)(state, d_output_features);
+  RULEBOOKITERATOR(UnPooling_BackwardPass<real>(
+                       THCState_getCurrentStream(state), diF, doF, nPlanes,
+                       input_features->size[1], d_output_features->size[1], rbB,
+                       nHotB, _rules.size());
+                   , )
+}
+#endif
--- a/sparseconvnet/SCN/generic/GPU/UnPooling.h
+++ b/sparseconvnet/SCN/generic/GPU/UnPooling.h
+// Copyright 2016-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+#ifndef GPU_UNPOOLING_H
+#define GPU_UNPOOLING_H
+
+// NTX must be >=2 so r is filled properly
+template <typename T, uInt NTX, uInt NTY>
+__global__ void UnPooling_fp(T *input_features, T *output_features,
+                                  uInt nPlanes, uInt input_stride,
+                                  uInt output_stride, uInt *rules, uInt nHot) {
+  __shared__ uInt r[NTY * 2];
+  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+    {
+      uInt i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (n - nHot))
+        r[i] = rules[2 * n + i];
+    }
+    __syncthreads();
+    if (n + threadIdx.y < nHot) {
+      uInt i = r[2 * threadIdx.y + 1] * input_stride;
+      uInt o = r[2 * threadIdx.y] * output_stride;
+      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+        output_features[o + plane]+=input_features[i + plane];
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T>
+void UnPooling_ForwardPass(cudaStream_t stream, T *input_features,
+                                T *output_features, uInt nPlanes,
+                                uInt input_stride, uInt output_stride,
+                                uInt *rules, uInt nHot, uInt filterVolume) {
+  UnPooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
+      input_features, output_features, nPlanes, input_stride, output_stride,
+      rules, nHot);
+}
+template <typename T, uInt NTX, uInt NTY>
+__global__ void UnPooling_bp(T *d_input_features, T *d_output_features,
+                                  uInt nPlanes, uInt input_stride,
+                                  uInt output_stride, uInt *rules, uInt nHot) {
+  __shared__ uInt r[NTY * 2];
+  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
+    {
+      uInt i = threadIdx.x + NTX * threadIdx.y;
+      if (i < NTY * 2 and i < 2 * (n - nHot))
+        r[i] = rules[2 * n + i];
+    }
+    __syncthreads();
+    if (n + threadIdx.y < nHot) {
+      uInt i = r[2 * threadIdx.y + 1] * input_stride;
+      uInt o = r[2 * threadIdx.y] * output_stride;
+      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
+        d_input_features[i + plane] += d_output_features[o + plane];
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T>
+void UnPooling_BackwardPass(cudaStream_t stream, T *d_input_features,
+                                 T *d_output_features, uInt nPlanes,
+                                 uInt input_stride, uInt output_stride,
+                                 uInt *rules, uInt nHot, uInt filterVolume) {
+  UnPooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
+      d_input_features, d_output_features, nPlanes, input_stride, output_stride,
+      rules, nHot);
+}
+#endif /* GPU_UNPOOLING_H */
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h