Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
SparseConvNet
Commits
f9552033
Commit
f9552033
authored
Jul 16, 2017
by
Benjamin Thomas Graham
Browse files
initial commit
parents
Changes
168
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2175 additions
and
0 deletions
+2175
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
+87
-0
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
+591
-0
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
+36
-0
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
+25
-0
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
+59
-0
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
+78
-0
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
+141
-0
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
+33
-0
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
+58
-0
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
+72
-0
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
.../sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
+30
-0
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
...arseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
+63
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
...h/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
+35
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
...rch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
+158
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
+209
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
+119
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
...h/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
+121
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
...h/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
+61
-0
PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
...parseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
+92
-0
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
+107
-0
No files found.
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/Deconvolution.cu"
#else
#include "Convolution.h"
#include "Deconvolution.h"
#include <algorithm>

// Forward pass of a sparse deconvolution layer.
// Resizes output_features to (nActive, weight->size[1]), optionally applies
// the bias, then accumulates input x weight into the output for every
// rule-book bucket. Returns the number of multiply-accumulate flops performed.
// Note: the rule book is requested with (outputSize, inputSize) swapped
// relative to Convolution - deconvolution reuses convolution rules backwards.
extern "C" double scn_DR_(Deconvolution_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
  // No bias: zero the output so the rule-book accumulation starts clean.
  // With a bias the Convolution_fp_bias kernel below fills every row instead.
  if (not bias)
    THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto ip = input_features->size[1];
  auto op = output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  double flops = 0;
  if (bias) {
    auto b = THCTensor_(data)(state, bias);
    // Broadcast the bias into the output, 32 planes at a time.
    for (uInt i = 0; i < op; i += 32) {
      uInt blockDim = min(32L, op - i);
      uInt gridDim = min(4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(oF + i, b + i,
                                                                op, op,
                                                                nActive);
    }
  }
  // c = size of one ip x op weight slice; the weight pointer advances by c
  // for each spatial filter offset as the rule-book iterator steps buckets.
  uInt c = ip * op;
  RULEBOOKITERATOR(
      dDeconvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
                                    THCState_getCurrentStream(state));
      , w += c; flops += nHotB * c;)
  return flops;
}
// Backward pass of a sparse deconvolution layer: computes d_input_features
// and accumulates d_weight (and d_bias when present). d_input_features is
// resized to match input_features and zeroed before accumulation.
extern "C" void scn_DR_(Deconvolution_backward)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
    THLongTensor *filterStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
    long filterVolume, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto _rules =
      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  auto ip = input_features->size[1];
  auto op = d_output_features->size[1];
  auto w = THCTensor_(data)(state, weight);
  auto dw = THCTensor_(data)(state, d_weight);
  // Advance both weight and weight-gradient pointers by one ip x op slice
  // per rule-book bucket, mirroring the forward pass.
  uInt c = ip * op;
  RULEBOOKITERATOR(
      dDeconvolution_backward_dW2<real>(iF, diF, doF, w, dw, rbB, nHotB, ip,
                                        ip, op, op,
                                        THCState_getCurrentStream(state));
      , w += c; dw += c;)
  if (d_bias) {
    auto db = THCTensor_(data)(state, d_bias);
    // Bias gradient: reduce d_output_features over the nActive rows.
    Convolution_bp_bias(doF, db, op, op, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_DECONVOLUTION_H
#define GPU_DECONVOLUTION_H
#include "../SparseConvNet.h"
#include "Convolution.h"
// Tiled deconvolution forward kernel, "A" variant: assumes nHot is an exact
// multiple of K so no bounds checks are needed inside the tile.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // nHot must be a multiple of K!!
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];                  // per-thread accumulator: V output values
  __shared__ T W[K][K];    // K x K weight tile
  __shared__ T I[K][K];    // K x K input tile
  uInt R0[V];              // input row indices from the rule book
  uInt R1[V];              // output row indices from the rule book
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  // Each thread covers V rows of the tile, spaced K/V apart.
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // rules[] stores (output, input) index pairs per active site.
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      // Read-modify-write into the output rows selected by the rule book.
      // NOTE(review): appears to rely on rule buckets not mapping two tile
      // rows to the same output row concurrently - confirm rule-book
      // construction guarantees this.
#pragma unroll
      for (int v = 0; v < V; v++)
        O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    // Advance to the next K input planes.
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Tiled deconvolution forward kernel, "B" variant: like forwardA but with
// per-row `s + ty[v] < nHot` guards, so it handles the ragged tail where
// nHot is not a multiple of K (launched with a single block in x).
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
  uInt M = input_nPlanes / K;
  // N = gridDim.y == output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        // Only rows that exist in the (possibly partial) last tile.
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Dispatch helper: if both plane counts divide by K, launch forwardA on the
// multiple-of-K prefix (o rows) and forwardB on the remaining tail, then
// return. Tried with decreasing tile sizes below.
#define FOO(K, V) \
  { \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
      uInt o = (nHot / K) * K; \
      if (o >= K) \
        dDeconvolution_KMxKN_forwardA<T, K, V> << < \
            dim3(std::min(o / K, (uInt)512), output_nPlanes / K), \
            dim3(K, K / V), 0, stream>>> \
            (inFeatures, outFeatures, w, rules, o, input_nPlanes, \
             input_stride, output_nPlanes, output_stride); \
      if (nHot > o) \
        dDeconvolution_KMxKN_forwardB<T, K, V> << < \
            dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>> \
            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
             input_nPlanes, input_stride, output_nPlanes, output_stride); \
      return; \
    } \
  }
// Host-side launcher for the deconvolution forward kernels. Tries tile sizes
// 64/32/16/8; callers must ensure both plane counts are divisible by 8
// (dDeconvolution_forward2 routes other shapes elsewhere), else asserts.
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                            uInt nHot, uInt input_nPlanes, uInt input_stride,
                            uInt output_nPlanes, uInt output_stride,
                            cudaStream_t stream) {
  FOO(64, 16)
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "A" variant: no bounds checks; nHot must be a multiple of K.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];                 // per-thread input-gradient accumulator
  T dW[V];                 // per-thread weight-gradient accumulator
  __shared__ T I[K][K];    // input tile
  __shared__ T dO[K][K];   // output-gradient tile
  __shared__ T W[K][K];    // weight tile
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        R1[v] = rules[2 * (s + ty[v])];
        R0[v] = rules[2 * (s + ty[v]) + 1];
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          // dI = dO * W^T ; dW = I^T * dO (both within the shared tiles)
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
    // Blocks in x accumulate into the same dw slice, hence atomicAdd.
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// "B" variant: bounds-checked version of backward_dW_A for the ragged tail
// where nHot is not a multiple of K; out-of-range tile rows are zero-filled.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = output_nPlanes / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  uInt R0[V];
  uInt R1[V];
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (s + ty[v] < nHot) {
          R1[v] = rules[2 * (s + ty[v])];
          R0[v] = rules[2 * (s + ty[v]) + 1];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot) {
          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
        } else {
          // Zero padding keeps the unguarded tile multiply below correct.
          I[ty[v]][tx] = 0;
          dO[ty[v]][tx] = 0;
        }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++) {
          dI[v] += dO[ty[v]][k] * W[tx][k];
          dW[v] += I[k][ty[v]] * dO[k][tx];
        }
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
      for (int v = 0; v < V; v++)
        if (s + ty[v] < nHot)
          dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Dispatch helper: launch the unchecked "A" backward kernel on the
// multiple-of-K prefix and the checked "B" kernel on the tail, then return.
#define FOO(K, V) \
  { \
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
      uInt o = (nHot / K) * K; \
      if (o >= K) \
        dDeconvolution_KMxKN_backward_dW_A<T, K, V> << < \
            dim3(std::min(o / K, (uInt)512), input_nPlanes / K), \
            dim3(K, K / V), 0, stream>>> \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
             input_nPlanes, input_stride, output_nPlanes, output_stride); \
      if (nHot > o) \
        dDeconvolution_KMxKN_backward_dW_B<T, K, V> << < \
            dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>> \
            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
             nHot - o, input_nPlanes, input_stride, output_nPlanes, \
             output_stride); \
      return; \
    } \
  }
// Host-side launcher for the deconvolution backward kernels; tries tile
// sizes 32/16/8 and asserts if no tile size divides both plane counts.
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures,
                                T *dOutFeatures, T *w, T *dw, uInt *rules,
                                uInt nHot, uInt input_nPlanes,
                                uInt input_stride, uInt output_nPlanes,
                                uInt output_stride, cudaStream_t stream) {
  FOO(32, 8)
  FOO(16, 4)
  FOO(8, 2)
  assert(false);
}
#undef FOO
// General deconvolution forward kernel for plane counts that are NOT
// multiples of the tile size: KI/KO clip the active part of each K x K tile,
// and rules are staged through shared memory. Accumulates with += (guarded by
// __syncthreads) instead of the read-modify-write of forwardA/B.
template <typename T, uInt K, uInt V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                              uInt nHot, uInt input_nPlanes, uInt input_stride,
                              uInt output_nPlanes, uInt output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
  // - parallel over N,nHot - loop over M
  uInt M = (input_nPlanes + K - 1) / K;
  // N = gridDim.y ~ output_nPlanes/K
  uInt n = blockIdx.y;
  outFeatures += n * K;
  w += n * K;
  uInt KO = min(K, output_nPlanes - K * n); // valid output cols in this tile
  T O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ uInt R[K * 2]; // rule pairs for the current K sites
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int m = 0; m < M; m++) {
    uInt KI = min(K, input_nPlanes - K * m); // valid input cols in this tile
    // Read w
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs
#pragma unroll
      for (int v = 0; v < V; v++) {
        // Rows 0 and 1 of the thread tile cooperatively load 2*K rule words.
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
      }
      __syncthreads();
      // Read input, reset O[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        O[v] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KI; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KO and s + ty[v] < nHot)
          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
    inFeatures += K;
  }
}
// Entry point for the deconvolution forward pass on the GPU.
// Shapes whose plane counts are not multiples of 8 go through the general
// (clipped-tile) kernel; nicely divisible shapes use the faster specialized
// launchers in dDeconvolution_forward.
template <typename T>
void dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
                             uInt nHot, uInt input_nPlanes, uInt input_stride,
                             uInt output_nPlanes, uInt output_stride,
                             cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dDeconvolution_KMxKN_forward2<T, K, V>
        <<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V), 0,
           stream>>>(inFeatures, outFeatures, w, rules, nHot, input_nPlanes,
                     input_stride, output_nPlanes, output_stride);
    return;
  } else {
    dDeconvolution_forward(inFeatures, outFeatures, w, rules, nHot,
                           input_nPlanes, input_stride, output_nPlanes,
                           output_stride, stream);
  }
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// General backward kernel for plane counts that are not tile multiples:
// KI/KO clip the valid part of each tile, out-of-range entries are zeroed,
// and both dInFeatures and dw updates are guarded.
template <typename T, uInt K, uInt V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
    uInt output_stride) {
  // M = gridDim.y == input_nPlanes / K
  uInt N = (output_nPlanes + K - 1) / K;
  uInt m = blockIdx.y;
  inFeatures += m * K;
  dInFeatures += m * K;
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;
  uInt KI = min(K, input_nPlanes - K * m); // valid input cols for this block
  T dI[V];
  T dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
  __shared__ uInt R[K * 2]; // rule pairs for the current K sites
  const int tx = threadIdx.x;
  int ty[V];
#pragma unroll
  for (int v = 0; v < V; v++)
    ty[v] = threadIdx.y + v * (K / V);
  for (int n = 0; n < N; n++) {
    uInt KO = min(K, output_nPlanes - K * n); // valid output cols this pass
    // Read w, reset dW
#pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }
    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
      // Read rules for K input/output pairs, reset dI[]
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (ty[v] < 2) {
          int q = ty[v] * K + tx;
          if (s + q / 2 < nHot)
            R[q] = rules[2 * s + q];
        }
        dI[v] = 0;
      }
      __syncthreads();
      // Read input and dOutput
#pragma unroll
      for (int v = 0; v < V; v++) {
        if (tx < KI and s + ty[v] < nHot)
          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
        else
          I[ty[v]][tx] = 0;
        if (tx < KO and s + ty[v] < nHot)
          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
        else
          dO[ty[v]][tx] = 0;
      }
      __syncthreads();
#pragma unroll
      for (int k = 0; k < KO; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
      for (int k = 0; k < K; k++)
#pragma unroll
        for (int v = 0; v < V; v++)
          dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
#pragma unroll
      for (int v = 0; v < V; v++)
        if (tx < KI and s + ty[v] < nHot)
          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
#pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
  }
}
// Entry point for the deconvolution backward pass on the GPU; mirrors
// dDeconvolution_forward2's routing: non-multiple-of-8 plane counts use the
// general clipped-tile kernel, otherwise the specialized launchers.
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures,
                                 T *dOutFeatures, T *w, T *dw, uInt *rules,
                                 uInt nHot, uInt input_nPlanes,
                                 uInt input_stride, uInt output_nPlanes,
                                 uInt output_stride, cudaStream_t stream) {
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    dDeconvolution_KMxKN_backward_dW2<T, K, V>
        <<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V), 0,
           stream>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rules,
                     nHot, input_nPlanes, input_stride, output_nPlanes,
                     output_stride);
    return;
  } else {
    dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
                               rules, nHot, input_nPlanes, input_stride,
                               output_nPlanes, output_stride, stream);
  }
}
#endif
/* GPU_DECONVOLUTION_H */
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/LeakyReLU.cu"
#else
#include "LeakyReLU.h"

// Elementwise LeakyReLU forward. Supports in-place operation: when
// input_features == output_features no resize is performed and the kernel
// overwrites the input buffer.
extern "C" void scn_R_(LeakyReLU_updateOutput)(THCTensor *input_features,
                                               THCTensor *output_features,
                                               float alpha) {
  if (input_features != output_features)
    THCTensor_(resizeAs)(state, output_features, input_features);
  auto n = THCTensor_(nElement)(state, input_features);
  // Fixed <<<16, 1024>>> launch; the kernel grid-strides over all n elements.
  LeakyReLU_fp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, output_features), n, alpha);
}
// Elementwise LeakyReLU backward: routes d_output through the slope chosen
// by the sign of the (forward) input. Supports in-place operation when
// d_input_features == d_output_features.
extern "C" void scn_R_(LeakyReLU_updateGradInput)(THCTensor *input_features,
                                                  THCTensor *d_input_features,
                                                  THCTensor *d_output_features,
                                                  float alpha) {
  if (d_input_features != d_output_features)
    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
  auto n = THCTensor_(nElement)(state, d_input_features);
  LeakyReLU_bp<real><<<16, 1024, 0, THCState_getCurrentStream(state)>>>(
      THCTensor_(data)(state, input_features),
      THCTensor_(data)(state, d_input_features),
      THCTensor_(data)(state, d_output_features), n, alpha);
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H

// Forward LeakyReLU applied elementwise over a flat array of n values:
//   out[i] = in[i]         if in[i] > 0
//   out[i] = in[i] * alpha otherwise
// Safe for in-place use (input_features == output_features).
//
// FIX: the loop stride was hard-coded as 16 * 1024, silently assuming the
// <<<16, 1024>>> launch used by the callers; any other launch shape would
// skip or double-process elements. A proper grid-stride loop
// (gridDim.x * blockDim.x) is identical under the current launch and correct
// for any launch configuration.
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, uInt n,
                             T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    output_features[i] = (input_features[i] > 0)
                             ? input_features[i]
                             : (input_features[i] * alpha);
}
// Backward LeakyReLU applied elementwise over a flat array of n values:
//   d_in[i] = d_out[i]         if in[i] > 0
//   d_in[i] = d_out[i] * alpha otherwise
// input_features must hold the values seen by the forward pass.
//
// FIX: as in LeakyReLU_fp, the stride was hard-coded as 16 * 1024 (the
// callers' launch size); replaced with the launch-independent grid stride
// gridDim.x * blockDim.x — identical behavior under the current launch.
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
                             T *d_output_features, uInt n, T alpha) {
  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x)
    d_input_features[i] = (input_features[i] > 0)
                              ? d_output_features[i]
                              : (d_output_features[i] * alpha);
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/MaxPooling.cu"
#else
#include "MaxPooling.h"
#include "RuleBookIterator.h"

// Sparse max-pooling forward. The first nFeaturesToDrop planes of the input
// are skipped (iF is offset past them and nPlanes reduced accordingly).
extern "C" void scn_DR_(MaxPooling_updateOutput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resize2d)(state, output_features, nActive, nPlanes);
  // NOTE(review): the max buffer is initialized to zero, so pooled outputs
  // never go below 0 - appears to assume non-negative (post-ReLU) features;
  // confirm against callers.
  THCTensor_(zero)(state, output_features);
  auto iF = THCTensor_(data)(state, input_features) + nFeaturesToDrop;
  auto oF = THCTensor_(data)(state, output_features);
  RULEBOOKITERATOR(
      MaxPooling_ForwardPass<real>(THCState_getCurrentStream(state), iF, oF,
                                   nPlanes, input_features->size[1],
                                   output_features->size[1], rbB, nHotB);
      , )
}
// Sparse max-pooling backward: d_input_features is resized/zeroed, then each
// rule-book bucket routes output gradients back to the input locations that
// achieved the max (comparison done against the saved forward tensors).
extern "C" void scn_DR_(MaxPooling_updateGradInput)(
    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *poolSize,
    THLongTensor *poolStride, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *output_features,
    THCTensor *d_output_features, long nFeaturesToDrop,
    THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  uInt nPlanes = input_features->size[1] - nFeaturesToDrop;
  auto _rules =
      _m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  uInt nActive = _m.getNActive(outputSize);
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  RULEBOOKITERATOR(
      MaxPooling_BackwardPass<real>(THCState_getCurrentStream(state), iF, diF,
                                    oF, doF, nPlanes,
                                    input_features->size[1],
                                    d_output_features->size[1], rbB, nHotB);
      , )
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_MAXPOOLING_H
#define GPU_MAXPOOLING_H
// NTX must be >=2 so r is filled properly

// Max-pooling forward kernel. Each y-row of the block handles one rule-book
// entry (an input-row -> output-row pair); threads in x sweep the planes and
// take an elementwise running max into the output row.
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // FIX: the guard was `i < 2 * (n - nHot)`. Since n < nHot here and the
      // operands are unsigned, n - nHot wraps to a huge value, so the bound
      // never limited the load and the final (partial) iteration could read
      // past the end of `rules`. The intended bound is the number of rule
      // words remaining: 2 * (nHot - n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row offset
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row offset
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX) {
        T inp = input_features[i + plane];
        if (output_features[o + plane] < inp)
          output_features[o + plane] = inp;
      }
    }
    __syncthreads();
  }
}
// Host launcher for MaxPooling_fp with a fixed 32x32 block over 32 blocks;
// the kernel grid-strides over the nHot rule entries.
template <typename T>
void MaxPooling_ForwardPass(cudaStream_t stream, T *input_features,
                            T *output_features, uInt nPlanes,
                            uInt input_stride, uInt output_stride,
                            uInt *rules, uInt nHot) {
  MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      input_features, output_features, nPlanes, input_stride, output_stride,
      rules, nHot);
}
// Max-pooling backward kernel: for each rule-book entry, gradients flow to
// the input positions whose forward value equals the pooled output (the
// argmax; ties receive the gradient at every tying position).
template <typename T, uInt NTX, uInt NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
                              T *output_features, T *d_output_features,
                              uInt nPlanes, uInt input_stride,
                              uInt output_stride, uInt *rules, uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // FIX: as in MaxPooling_fp, the guard was `i < 2 * (n - nHot)`, which
      // underflows (unsigned, n < nHot) and never bounds the load, allowing
      // an out-of-bounds read of `rules` on the last partial iteration.
      // Corrected to the remaining rule-word count, 2 * (nHot - n).
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      uInt i = r[2 * threadIdx.y] * input_stride;      // input row offset
      uInt o = r[2 * threadIdx.y + 1] * output_stride; // output row offset
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        if (output_features[o + plane] == input_features[i + plane])
          d_input_features[i + plane] += d_output_features[o + plane];
    }
    __syncthreads();
  }
}
// Host launcher for MaxPooling_bp with a fixed 32x32 block over 32 blocks;
// the kernel grid-strides over the nHot rule entries.
template <typename T>
void MaxPooling_BackwardPass(cudaStream_t stream, T *input_features,
                             T *d_input_features, T *output_features,
                             T *d_output_features, uInt nPlanes,
                             uInt input_stride, uInt output_stride,
                             uInt *rules, uInt nHot) {
  MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, rules, nHot);
}
#endif
/* GPU_MAXPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/GPU/NetworkInNetwork.cu"
#else
#include "Convolution.h"
#include <algorithm>

// 1x1 "network in network" convolution forward: a dense GEMM over the
// nActive feature rows (output = input x weight [+ bias]). Returns the
// multiply-accumulate flop count.
extern "C" double scn_R_(NetworkInNetwork_updateOutput)(
    THCTensor *input_features_, THCTensor *output_features_,
    THCTensor *weight_, THCTensor *bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, output_features_, nActive, output_nPlanes);
  auto input_features = THCTensor_(data)(state, input_features_);
  auto output_features = THCTensor_(data)(state, output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  if (bias_ != nullptr) {
    auto bias = THCTensor_(data)(state, bias_);
    // Broadcast the bias into the output, then GEMM with beta=1 so the GEMM
    // accumulates on top of it.
    for (uInt i = 0; i < output_nPlanes; i += 32) {
      uInt blockDim = min(32L, output_nPlanes - i);
      uInt gridDim = min(4096L, nActive);
      Convolution_fp_bias<<<gridDim, blockDim, 0,
                            THCState_getCurrentStream(state)>>>(
          output_features + i, bias + i, output_nPlanes, output_nPlanes,
          nActive);
    }
    // Do GEMM (note: gemm assumes column-major matrices)
    // buffer is l*m (row-major)
    // weight is m*r (row-major)
    // output_features is l*r (row-major)
    // buffer * weights + bias -> output_features
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight, output_nPlanes,        // r
                input_features, input_nPlanes, // m
                1,                             // beta
                output_features, output_nPlanes // r
                );
  } else {
    // No bias: zero the output and GEMM with beta=0.
    THCTensor_(zero)(state, output_features_);
    THBLAS_GEMM(state, 'n', 'n',
                output_nPlanes, // r
                nActive,        // l
                input_nPlanes,  // m
                1,              // alpha
                weight, output_nPlanes,        // r
                input_features, input_nPlanes, // m
                0,                             // beta
                output_features, output_nPlanes // r
                );
  }
  return nActive * input_nPlanes * output_nPlanes;
}
// Backward pass w.r.t. the input: d_input = d_output * W^T.
// d_input_features_ is resized to nActive x input_nPlanes and overwritten.
extern "C" void scn_R_(NetworkInNetwork_updateGradInput)(
    THCTensor *d_input_features_, THCTensor *d_output_features_,
    THCTensor *weight_) {
  auto nActive = d_output_features_->size[0];
  auto input_nPlanes = weight_->size[0];
  auto output_nPlanes = weight_->size[1];
  THCTensor_(resize2d)(state, d_input_features_, nActive, input_nPlanes);
  THCTensor_(zero)(state, d_input_features_);
  auto d_input_features = THCTensor_(data)(state, d_input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto weight = THCTensor_(data)(state, weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // d_output_features is l*m (row-major)
  // weights is r*m (row-major)
  // d_buffer is l*r (row-major)
  // d_output_features * T(weight) -> d_buffer
  THBLAS_GEMM(state, 't', 'n',
              input_nPlanes,  // r
              nActive,        // l
              output_nPlanes, // m
              1,              // alpha
              weight,
              output_nPlanes, // m
              d_output_features,
              output_nPlanes, // m
              0,              // beta: overwrite d_input_features
              d_input_features,
              input_nPlanes // r
              );
}
// Accumulate parameter gradients: d_weight += input^T * d_output (beta == 1),
// and, if d_bias_ is non-null, reduce d_output over rows into d_bias via
// Convolution_bp_bias (presumably accumulating — confirm in Convolution.h).
extern "C" void scn_R_(NetworkInNetwork_accGradParameters)(
    THCTensor *input_features_, THCTensor *d_output_features_,
    THCTensor *d_weight_, THCTensor *d_bias_) {
  auto nActive = input_features_->size[0];
  auto input_nPlanes = d_weight_->size[0];
  auto output_nPlanes = d_weight_->size[1];
  auto input_features = THCTensor_(data)(state, input_features_);
  auto d_output_features = THCTensor_(data)(state, d_output_features_);
  auto d_weight = THCTensor_(data)(state, d_weight_);
  // Do GEMM (note: gemm assumes column-major matrices)
  // buffer is m*l (row-major)
  // d_output_features is m*r (row-major)
  // weights is l*r (row-major)
  // T(buffer) * d_output_features -> d_weight
  THBLAS_GEMM(state, 'n', 't',
              output_nPlanes, // r
              input_nPlanes,  // l
              nActive,        // m
              1,              // alpha
              d_output_features,
              output_nPlanes, // r
              input_features,
              input_nPlanes, // l
              1,             // beta: accumulate into existing d_weight
              d_weight,
              output_nPlanes // r
              );
  if (d_bias_) {
    auto d_bias = THCTensor_(data)(state, d_bias_);
    Convolution_bp_bias(d_output_features, d_bias, output_nPlanes,
                        output_nPlanes, nActive,
                        THCState_getCurrentStream(state));
  }
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_RULEBOOKITERATOR_H
#define GPU_RULEBOOKITERATOR_H
// Macro to parallelize loading rulebook elements to GPU memory and operating
// on the elements of the rulebook.
// X is the function to apply.
// Y is a command to run
// Names bound inside the expansion (visible to X and Y):
//   ms    - upper bound on a single rulebook's element count; rulesBuffer is
//           grown once up front so every copy below fits.
//   rbB   - device pointer to the current rulebook's (input,output) pairs.
//   nHotB - number of pairs in rulebook k (r.size() / 2).
// X runs only when rulebook k is non-empty; Y runs after every k regardless
// (callers use Y to advance output pointers between filter offsets).
// NOTE(review): the copy is a synchronous cudaMemcpy on the default stream —
// presumably intentional so rbB is populated before the kernel X launches;
// confirm stream semantics before making it async.
#define RULEBOOKITERATOR(X, Y) \
  uInt ms = ruleBookMaxSize(_rules); \
  if (THCITensor_nElement(state, rulesBuffer) < ms) \
    THCITensor_resize1d(state, rulesBuffer, ms); \
  uInt *rbB = (uInt *)THCITensor_data(state, rulesBuffer); \
  for (int k = 0; k < _rules.size(); ++k) { \
    auto &r = _rules[k]; \
    uInt nHotB = r.size() / 2; \
    if (nHotB) { \
      cudaMemcpy(rbB, &r[0], sizeof(uInt) * 2 * nHotB, \
                 cudaMemcpyHostToDevice); \
    } \
    if (nHotB) { \
      X \
    } \
    Y \
  }
#endif
/* GPU_RULEBOOKITERATOR_H */
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#define TH_GENERIC_FILE_ "generic/GPU/SparseToDense.cu"
#else
#include "SparseToDense.h"
// Scatter the sparse feature matrix into a dense tensor of shape
// batchSize x nPlanes x spatial..., zero-filled at inactive sites.
extern "C" void scn_DR_(SparseToDense_updateOutput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *output_features, THCITensor *rulesBuffer) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  {
    // Size and zero the dense output: [batch, nPlanes, spatial dims...].
    long sz[Dimension + 2];
    sz[0] = _m.inputSGs->size();
    sz[1] = input_features->size[1];
    for (int i = 0; i < Dimension; i++) {
      auto x = THLongTensor_data(inputSize)[i];
      sz[i + 2] = x;
    }
    THCTensor_(resizeNd)(state, output_features, Dimension + 2, sz, NULL);
    THCTensor_(zero)(state, output_features);
  }
  // Fix: bind by reference. getSparseToDenseRuleBook returns a reference to a
  // cached RuleBook; plain `auto` deep-copied the whole vector-of-vectors on
  // every forward pass.
  auto &_rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = input_features->size[1];
  auto iF = THCTensor_(data)(state, input_features);
  auto oF = THCTensor_(data)(state, output_features);
  // Rulebook k holds the active sites at row-major spatial offset k. Within a
  // sample, element (plane, site) lives at plane*spatialVolume + site, and the
  // kernel applies the plane*spatialVolume stride itself, so advancing oF by
  // ONE per rulebook addresses successive spatial sites (resolves the old
  // "++ or +=spatialVolume?" todo: ++ is correct).
  RULEBOOKITERATOR(SparseToDense_ForwardPass<real>(
                       THCState_getCurrentStream(state), iF, oF, nPlanes,
                       spatialVolume, rbB, nHotB);
                   , oF++;)
}
// Backward pass of SparseToDense: gather gradients from the dense
// d_output_features back into the sparse d_input_features rows.
extern "C" void scn_DR_(SparseToDense_updateGradInput)(
    THLongTensor *inputSize, void **m, THCTensor *input_features,
    THCTensor *d_input_features, THCTensor *d_output_features,
    THCITensor *rulesBuffer) {
  THCTensor_(resizeAs)(state, d_input_features, input_features);
  THCTensor_(zero)(state, d_input_features);
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  // Fix: bind by reference. The getter returns a reference to a cached
  // RuleBook; plain `auto` deep-copied the whole vector-of-vectors per call.
  auto &_rules = _m.getSparseToDenseRuleBook(inputSize, true);
  auto spatialVolume = _rules.size();
  uInt nPlanes = d_input_features->size[1];
  auto diF = THCTensor_(data)(state, d_input_features);
  auto doF = THCTensor_(data)(state, d_output_features);
  // doF++ advances one spatial site per rulebook; planes are strided by
  // spatialVolume inside the kernel (mirrors updateOutput).
  RULEBOOKITERATOR(SparseToDense_BackwardPass<real>(
                       THCState_getCurrentStream(state), diF, doF, nPlanes,
                       spatialVolume, rbB, nHotB);
                   , doF++;)
}
#endif
PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef GPU_SPARSETODENSE_H
#define GPU_SPARSETODENSE_H
#include "../SparseConvNet.h"
//#include <THC/THCAtomics.cuh>
// NTX must be >=2 so r is filled properly
// Forward kernel: copy each active site's nPlanes features into the dense
// output. rules holds nHot (inputRow, batchIdx) pairs for one spatial site;
// the thread block stages NTY pairs at a time in shared memory.
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Bug fix: the bound was "2 * (n - nHot)"; since n < nHot and uInt is
      // unsigned, that underflowed to a huge value, making the guard vacuous
      // and reading past the end of `rules` on the final partial chunk.
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &output_features[r[2 * threadIdx.y + 1] * spatialVolume * nPlanes];
      // Dense layout strides planes by spatialVolume within a sample.
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        o[plane * spatialVolume] = i[plane];
    }
    __syncthreads(); // protect r[] before the next chunk overwrites it
  }
}
// Host-side launcher for SparseToDense_fp. All pointers are device pointers;
// the kernel is enqueued on `stream` with a fixed 32x(32,32) configuration.
template <typename T>
void SparseToDense_ForwardPass(cudaStream_t stream, T *input_features,
                               T *output_features, uInt nPlanes,
                               uInt spatialVolume, uInt *rules, uInt nHot) {
  const uInt gridSize = 32;
  const dim3 blockShape(32, 32); // x: planes, y: rule pairs
  SparseToDense_fp<T, 32, 32><<<gridSize, blockShape, 0, stream>>>(
      input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
// Backward kernel: gather each active site's gradient from the dense
// d_output_features into its row of d_input_features (mirror of the forward
// copy, with source and destination swapped).
template <typename T, uInt NTX, uInt NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
                                 uInt nPlanes, uInt spatialVolume, uInt *rules,
                                 uInt nHot) {
  __shared__ uInt r[NTY * 2];
  for (uInt n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
    {
      uInt i = threadIdx.x + NTX * threadIdx.y;
      // Bug fix: the bound was "2 * (n - nHot)", which underflows (uInt) and
      // made the guard vacuous, reading past the end of `rules` on the final
      // partial chunk. The remaining-pairs count is nHot - n.
      if (i < NTY * 2 and i < 2 * (nHot - n))
        r[i] = rules[2 * n + i];
    }
    __syncthreads();
    if (n + threadIdx.y < nHot) {
      T *i = &d_input_features[r[2 * threadIdx.y] * nPlanes];
      T *o = &d_output_features[r[2 * threadIdx.y + 1] * spatialVolume *
                                nPlanes];
      for (uInt plane = threadIdx.x; plane < nPlanes; plane += NTX)
        i[plane] = o[plane * spatialVolume];
    }
    __syncthreads(); // protect r[] before the next chunk overwrites it
  }
}
// Host-side launcher for SparseToDense_bp; configuration mirrors the
// forward launcher. All pointers are device pointers.
template <typename T>
void SparseToDense_BackwardPass(cudaStream_t stream, T *d_input_features,
                                T *d_output_features, uInt nPlanes,
                                uInt spatialVolume, uInt *rules, uInt nHot) {
  const uInt gridSize = 32;
  const dim3 blockShape(32, 32); // x: planes, y: rule pairs
  SparseToDense_bp<T, 32, 32><<<gridSize, blockShape, 0, stream>>>(
      d_input_features, d_output_features, nPlanes, spatialVolume, rules,
      nHot);
}
#endif
/* GPU_SPARSETODENSE_H */
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error \
    "You must define TH_GENERIC_FILE before including THGenerateCudaFloatTypes.h"
#endif
// Expand TH_GENERIC_FILE once per supported CUDA floating-point type,
// following the TH "generic" convention (real/accreal/Real/CReal) so the
// included file can build type-suffixed names such as THCTensor_(...).
// Only float is generated at present; THBLAS_GEMM maps to the matching
// cuBLAS GEMM wrapper. TH_GENERIC_FILE is consumed (#undef'd) at the end,
// so callers must re-#define it before including this header again.
// float
#define real float
#define accreal double
#define Real Float
#define CReal Cuda
#define TH_REAL_IS_FLOAT
#define THBLAS_GEMM THCudaBlas_Sgemm
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef accreal
#undef real
#undef Real
#undef CReal
#undef TH_REAL_IS_FLOAT
#undef THBLAS_GEMM
#undef TH_GENERIC_FILE
PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE_
#error "Define TH_GENERIC_FILE_ before including THGenerateDimCudaFloatTypes.h"
#endif
// Instantiate TH_GENERIC_FILE_ for spatial dimensions 1..10.
// THGenerateCudaFloatTypes.h consumes (#undef's) TH_GENERIC_FILE on each
// pass, so it is re-#define'd before every include.
// (Fix: removed a stray duplicate "#define TH_GENERIC_FILE TH_GENERIC_FILE_"
// that preceded the Dimension 1 block — it was immediately redefined with an
// identical body and had no effect.)
#define Dimension 1
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 2
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 3
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 4
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 5
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 6
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 7
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 8
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 9
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#define Dimension 10
#define TH_GENERIC_FILE TH_GENERIC_FILE_
#include "THGenerateCudaFloatTypes.h"
#undef Dimension
#undef TH_GENERIC_FILE_
PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef ACTIVEPOOLING_H
#define ACTIVEPOOLING_H
#include "../SparseConvNet.h"
// Return the maximum number of active sites in the batch
// rules has size 1.
// rules[0] is a batchSize x (maxActive + 1) matrix.
// First column is number of active sites for that sample (<= maxActive)
// Remaining maxActive columns give the active sites, zero padded.
// Build the active-pooling rulebook (see header comment above for layout):
// rules[0] packs, per sample, [count, active site indices..., 0-padding] so
// every sample occupies exactly (maxActive + 1) entries; rules[1] holds
// [batchSize, maxActive].
template <uInt dimension>
void activePoolingRules(SparseGrids<dimension> &SGs, RuleBook &rules) {
  rules.clear();
  rules.resize(2);
  auto &r = rules[0];
  uInt maxActive = 0;
  for (auto &sg : SGs)
    maxActive = std::max(maxActive, (uInt)sg.mp.size());
  for (auto &sg : SGs) {
    r.push_back(sg.mp.size());
    for (auto &iter : sg.mp)
      r.push_back(sg.ctr + iter.second); // globalize the feature-row index
    // Bug fix: the padding test used rules.size() — a constant 2 that
    // push_back on r never changes — so it either skipped padding entirely
    // or spun forever whenever maxActive >= 2. The row data lives in r, so
    // pad r to a multiple of (maxActive + 1).
    while (r.size() % (maxActive + 1) != 0)
      r.push_back(0); // padding
  }
  rules[1].push_back(SGs.size());
  rules[1].push_back(maxActive);
}
#endif
/* ACTIVEPOOLING_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CONVOLUTIONRULES_H
#define CONVOLUTIONRULES_H
#include "RectangularRegions.h"
// For one sample: enumerate every (input site, output site) pair connected by
// the filter, creating output sites on demand.
// rules[f], for filter offset f in [0, volume(size)), collects flat pairs
// [in0, out0, in1, out1, ...]. Input indices are globalized by adding
// inputGrid.ctr; output indices are assigned from outputGrid.ctr, which the
// caller pre-seeds with the running global count.
// inputSpatialSize is accepted for signature symmetry but not used here.
template <uInt dimension>
void Convolution_InputSgToRulesAndOutputSg(
    SparseGrid<dimension> &inputGrid, SparseGrid<dimension> &outputGrid,
    RuleBook &rules, long *size, long *stride, long *inputSpatialSize,
    long *outputSpatialSize) {
  rules.resize(volume<dimension>(size)); // one pair list per filter offset
  for (auto const &inIter : inputGrid.mp) {
    // All output sites whose receptive field covers this input site.
    for (auto j : OutputRegionCalculator<dimension>(inIter.first, size, stride,
                                                    outputSpatialSize)) {
      auto inRegion = InputRegionCalculator<dimension>(j, size, stride);
      // Position of the input site within this output's filter footprint.
      uInt rulesOffset = inRegion.offset(inIter.first);
      auto outIter = outputGrid.mp.find(j);
      if (outIter == outputGrid.mp.end()) {
        // First hit on this output site: assign it the next output index.
        outIter =
            outputGrid.mp.insert(std::make_pair(j, outputGrid.ctr++)).first;
      }
      rules[rulesOffset].push_back(inIter.second + inputGrid.ctr);
      rules[rulesOffset].push_back(outIter->second);
    }
  }
}
// Sequential rulebook construction over a whole batch. Returns the total
// number of active output sites.
// ctr bookkeeping: each output grid's ctr is pre-seeded with the running
// global count so newly created output sites get globally unique indices;
// after the sample is processed, the grown ctr becomes the new running count
// and the grid's ctr is reset to 0 (its stored indices are already global).
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs(
    SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
    RuleBook &rules, long *filterSize, long *filterStride,
    long *input_spatialSize, long *output_spatialSize) {
  rules.clear();
  output_SGs.clear();
  uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  uInt output_nActive = 0;
  for (uInt i = 0; i < batchSize; i++) {
    auto &iSG = input_SGs[i];
    auto &oSG = output_SGs[i];
    oSG.ctr = output_nActive; // seed with running global count
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        iSG, oSG, rules, filterSize, filterStride, input_spatialSize,
        output_spatialSize);
    output_nActive = oSG.ctr; // grown by the sites created above
    oSG.ctr = 0;              // indices in oSG.mp are already global
  }
  return output_nActive;
}
// OpenMP variant: build one private rulebook per sample in parallel (each
// with sample-local output indices starting at 0), then compute per-sample
// global offsets with a prefix sum and merge, offsetting output indices.
// Returns the total number of active output sites.
template <uInt dimension>
uInt Convolution_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, SparseGrids<dimension> &output_SGs,
    RuleBook &rules, long *filterSize, long *filterStride,
    long *input_spatialSize, long *output_spatialSize) {
  rules.clear();
  rules.resize(volume<dimension>(filterSize));
  output_SGs.clear();
  uInt batchSize = input_SGs.size();
  output_SGs.resize(batchSize);
  std::vector<RuleBook> rbs(batchSize); // one private rulebook per sample
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++)
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], filterSize, filterStride,
          input_spatialSize, output_spatialSize);
  }
  // Exclusive prefix sum: after this, output_SGs[i].ctr is sample i's global
  // output-index offset and output_nActive is the grand total.
  uInt output_nActive = 0;
  for (uInt i = 0; i < batchSize; i++) {
    // Parallel assignment:
    // output_nActive <- output_nActive+output_SGs[i].ctr
    // output_SGs[i].ctr <- output_nActive
    uInt tmp = output_nActive;
    output_nActive += output_SGs[i].ctr;
    output_SGs[i].ctr = tmp;
  }
  // Merge: concatenate per-sample pairs per filter offset, shifting output
  // indices by each sample's global offset (input indices are already global
  // via inputGrid.ctr). Parallel over filter offsets — each R is private.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        auto offset = output_SGs[j].ctr;
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);          // input index (already global)
          R.push_back(r[k++] + offset); // output index + sample offset
        }
      }
    }
  }
  return output_nActive;
}
// for each site in filterVolume, list of (inputFeatureNumber,batchIdx) pairs
// Reuses the convolution rule generator with a filter covering the whole
// spatial extent at stride 1: each sample then collapses to a single output
// site, its assigned index is the batch index, and the filter-offset slot
// doubles as the row-major spatial site index.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  uInt batchSize = input_SGs.size();
  SparseGrids<dimension> output_SGs(batchSize); // scratch; discarded on exit
  std::vector<long> ones(dimension, 1);         // stride 1, output size 1
  rules.clear();
  for (uInt i = 0; i < batchSize; i++) {
    auto &iSG = input_SGs[i];
    auto &oSG = output_SGs[i];
    oSG.ctr = i;
    // batchIdx
    Convolution_InputSgToRulesAndOutputSg<dimension>(
        iSG, oSG, rules, spatialSize, &ones[0], spatialSize, &ones[0]);
  }
}
// OpenMP variant of the above: per-sample private rulebooks built in
// parallel, then concatenated per spatial site. No index offsetting is
// needed — output indices are batch indices (set via oSG.ctr = i) and input
// indices are already global.
template <uInt dimension>
void SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
    SparseGrids<dimension> &input_SGs, RuleBook &rules, long *spatialSize) {
  uInt batchSize = input_SGs.size();
  SparseGrids<dimension> output_SGs(batchSize); // scratch; discarded on exit
  std::vector<long> ones(dimension, 1);         // stride 1, output size 1
  rules.clear();
  rules.resize(volume<dimension>(spatialSize)); // one list per spatial site
  std::vector<RuleBook> rbs(batchSize);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < batchSize; i++) {
      output_SGs[i].ctr = i;
      // batchIdx
      Convolution_InputSgToRulesAndOutputSg<dimension>(
          input_SGs[i], output_SGs[i], rbs[i], spatialSize, &ones[0],
          spatialSize, &ones[0]);
    }
  }
  // Merge per-sample rulebooks; parallel over spatial sites, each R private.
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < rules.size(); i++) {
      auto &R = rules[i];
      for (uInt j = 0; j < batchSize; j++) {
        auto &r = rbs[j][i];
        for (uInt k = 0; k < r.size();) {
          R.push_back(r[k++]);
          R.push_back(r[k++]);
        }
      }
    }
  }
}
#endif
/* CONVOLUTIONRULES_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Geometry/Metadata.cpp"
#else
#include "Metadata.h"
#include <cstring>
// C entry point: forward the input spatial size to the Metadata object,
// creating it first if *m is not yet initialized (macro behavior).
extern "C" void scn_D_(setInputSpatialSize)(void **m,
                                            THLongTensor *spatialSize) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.setInputSpatialSize(spatialSize);
}
// Append an empty sample (sparse grid) to the batch and make it the current
// grid that subsequent setInputSpatialLocation calls write into.
extern "C" void scn_D_(batchAddSample)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  assert(_m.inputSGs && "Call setInputSpatialSize first, please!");
  _m.inputSGs->resize(_m.inputSGs->size() + 1);
  _m.inputSG = &_m.inputSGs->back(); // current sample
}
// Set the feature vector `vec` at spatial `location` in the current sample.
// New locations get a fresh row appended to `features` (which is resized);
// existing locations are only overwritten when `overwrite` is true.
extern "C" void scn_D_(setInputSpatialLocation)(void **m,
                                                THFloatTensor *features,
                                                THLongTensor *location,
                                                THFloatTensor *vec,
                                                bool overwrite) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto p = LongTensorToPoint<Dimension>(location);
  auto &mp = _m.inputSG->mp;
  auto &nActive = *_m.inputNActive;
  auto iter = mp.find(p);
  auto nPlanes = vec->size[0];
  if (iter == mp.end()) {
    // New active site: register it with the next row index, grow the
    // feature matrix by one row, and copy vec into that row.
    iter = mp.insert(std::make_pair(p, nActive++)).first;
    THFloatTensor_resize2d(features, nActive, nPlanes);
    std::memcpy(THFloatTensor_data(features) + (nActive - 1) * nPlanes,
                THFloatTensor_data(vec), sizeof(float) * nPlanes);
  } else if (overwrite) {
    // Existing site: replace its row in place.
    std::memcpy(THFloatTensor_data(features) + iter->second * nPlanes,
                THFloatTensor_data(vec), sizeof(float) * nPlanes);
  }
}
// Build batch metadata directly from a dense tensor's nonzero coordinates.
// nz_  : nActive x (1 + Dimension); column 0 is the batch index (assumed
//        sorted ascending — TODO confirm against the caller), columns
//        1..Dimension are spatial coordinates.
// pad_ : per-sample, per-dimension offsets added to each coordinate.
extern "C" void scn_D_(createMetadataForDenseToSparse)(
    void **m, THLongTensor *spatialSize_, THLongTensor *pad_,
    THLongTensor *nz_, long batchSize) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  _m.setInputSpatialSize(spatialSize_);
  _m.inputSGs->resize(batchSize);
  auto &nActive = *_m.inputNActive;
  nActive = nz_->size[0];
  auto nz = THLongTensor_data(nz_);
  auto pad = THLongTensor_data(pad_);
  // (Fix: removed an unused local that fetched spatialSize_'s data pointer.)
  // br[b]..br[b+1] delimit sample b's rows in nz.
  std::vector<uInt> br(batchSize + 1);
  if (batchSize == 1) {
    br[1] = nActive;
  } else {
    long b = 0;
    for (uInt i = 0; i < nActive; i++) {
      long B = nz[i * (Dimension + 1)];
      for (; b < B;)
        br[++b] = i;
    }
    for (; b < batchSize;)
      br[++b] = nActive; // trailing samples with no active sites
  }
  // Populate each sample's sparse grid in parallel; samples are disjoint.
  uInt b;
#pragma omp parallel for private(b)
  for (b = 0; b < batchSize; b++) {
    auto &sg = _m.inputSGs->at(b);
    for (uInt i = br[b]; i < br[b + 1]; i++) {
      Point<Dimension> x;
      for (uInt j = 0; j < Dimension; j++) {
        x[j] = nz[i * (Dimension + 1) + j + 1] + pad[b * Dimension + j];
        // 0-indexed
      }
      sg.mp[x] = i; // feature row == row in nz_
    }
  }
}
// tensor is size[0] x .. x size[Dimension-1] x size[Dimension]
// size[0] x .. x size[Dimension-1] == spatial volume
// size[Dimension] == #feature planes
// Append one sample whose active sites are the spatial locations of tensor_
// where any plane exceeds `threshold` in magnitude (and which fall inside
// spatialSize_ after adding offset_). features_ is grown pessimistically to
// hold every location, filled densely, then shrunk to the true count.
extern "C" void scn_D_(addSampleFromThresholdedTensor)(
    void **m, THFloatTensor *features_, THFloatTensor *tensor_,
    THLongTensor *offset_, THLongTensor *spatialSize_, float threshold) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  auto &nActive = *_m.inputNActive;
  auto &SGs = *_m.inputSGs;
  SGs.resize(SGs.size() + 1); // new sample
  auto &sg = SGs.back();
  auto tensor = THFloatTensor_data(tensor_);
  auto offset = THLongTensor_data(offset_);
  auto spatialSize = THLongTensor_data(spatialSize_);
  long *size = tensor_->size;
  auto nPlanes = size[Dimension];
  long volume = 1;
  for (int i = 0; i < Dimension; ++i)
    volume *= size[i];
  // Worst case: every location is active.
  THFloatTensor_resize2d(features_, nActive + volume, nPlanes);
  // Increment pointers as we work through the data
  auto features = THFloatTensor_data(features_) + nActive * nPlanes;
  // Active locations
  Point<Dimension> point;
  for (uInt i = 0; i < Dimension; i++)
    point[i] = offset[i];
  for (uInt ctr = 0; ctr < volume; ctr++) {
    // Active if any plane's magnitude exceeds the threshold...
    bool active = false;
    for (uInt i = 0; i < nPlanes; i++) {
      if (fabs(tensor[i]) > threshold) {
        active = true;
        break;
      }
    }
    // ...and the (offset) location lies inside the spatial extent.
    for (uInt i = 0; i < Dimension; i++) {
      if (point[i] < 0 or point[i] >= spatialSize[i]) {
        active = false;
        break;
      }
    }
    if (active) {
      sg.mp[point] = nActive++;
      std::memcpy(features, tensor, sizeof(float) * nPlanes);
      features += nPlanes;
    }
    tensor += nPlanes; // next spatial location's plane vector
    incrementPointInCube<Dimension>(point, size, offset);
  }
  // Shrink back to the rows actually written.
  THFloatTensor_resize2d(features_, nActive, nPlanes);
}
// 3x3 valid convolutions, 3x3/2x2 pooling or strided convolutions
// Pre-generate rulebooks for a VGG-style tower: at each scale, a size-3
// valid convolution, then a size-3 stride-2 downsample, repeating while
// every spatial dimension is odd and >= 3.
extern "C" void scn_D_(generateRuleBooks3s2)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  long sz[Dimension], str[Dimension], inS[Dimension], outS[Dimension];
  // Cache keys: p1 = spatial size; p2 = (spatial size, filter size);
  // p3 = (spatial size, filter size, stride).
  Point<Dimension> p1;
  Point<2 * Dimension> p2;
  Point<3 * Dimension> p3;
  for (int i = 0; i < Dimension; ++i) {
    p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
    p2[i + Dimension] = p3[i + Dimension] = sz[i] = 3;
    p3[i + 2 * Dimension] = str[i] = 2;
  }
  while (true) {
    auto &SGs = _m.grids[p1];
    auto &rb = _m.validRuleBooks[p2];
    if (rb.empty()) // only build if not already cached
      ValidConvolution_SgsToRules(SGs, rb, sz);
    // Stop unless every dimension can take another (odd, >=3) 2x downsample.
    for (int i = 0; i < Dimension; ++i)
      if (p1[i] < 3 or p1[i] % 2 != 1)
        return;
      else
        p1[i] = outS[i] = (inS[i] - 1) / 2;
    auto &SGs2 = _m.grids[p1];
    auto &rb2 = _m.ruleBooks[p3];
    if (rb2.empty())
      _m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
          SGs, SGs2, rb2, sz, str, inS, outS);
    // Advance the cache keys/sizes to the downsampled scale.
    for (int i = 0; i < Dimension; ++i)
      p2[i] = p3[i] = inS[i] = outS[i];
  }
}
// 3x3 valid convolutions, 2x2 pooling or strided convolutions
// Pre-generate rulebooks for a tower of size-3 valid convolutions followed
// by size-2 stride-2 downsamples, repeating while every spatial dimension is
// even and >= 2.
extern "C" void scn_D_(generateRuleBooks2s2)(void **m) {
  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
  long s2[Dimension], s3[Dimension], inS[Dimension], outS[Dimension];
  // Cache keys: p1 = spatial size; p2 = (spatial size, filter size);
  // p3 = (spatial size, filter size, stride).
  Point<Dimension> p1;
  Point<2 * Dimension> p2;
  Point<3 * Dimension> p3;
  for (int i = 0; i < Dimension; ++i) {
    p1[i] = p2[i] = p3[i] = inS[i] = _m.inputSpatialSize[i];
    p2[i + Dimension] = s3[i] = 3;
    p3[i + Dimension] = p3[i + 2 * Dimension] = s2[i] = 2;
  }
  while (true) {
    auto &SGs = _m.grids[p1];
    auto &rb = _m.validRuleBooks[p2];
    // Fix: guard on the cache being empty, matching generateRuleBooks3s2 and
    // getValidRuleBook. Without it a repeated call appended duplicate rules
    // into the cached rulebook.
    if (rb.empty())
      ValidConvolution_SgsToRules(SGs, rb, s3);
    // Stop unless every dimension can take another (even, >=2) 2x downsample.
    for (int i = 0; i < Dimension; ++i)
      if (p1[i] < 2 or p1[i] % 2 != 0)
        return;
      else
        p1[i] = outS[i] = inS[i] / 2;
    auto &SGs2 = _m.grids[p1];
    auto &rb2 = _m.ruleBooks[p3];
    if (rb2.empty())
      _m.nActive[p1] = Convolution_InputSgsToRulesAndOutputSgs(
          SGs, SGs2, rb2, s2, s2, inS, outS);
    // Advance the cache keys/sizes to the downsampled scale.
    for (int i = 0; i < Dimension; ++i)
      p2[i] = p3[i] = inS[i] = outS[i];
  }
}
// Destroy the Metadata object held in *m (macro deletes and resets the slot).
extern "C" void scn_D_(freeMetadata)(void **m) {
  SCN_DELETE(Metadata<Dimension>, m)
}
#endif
PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef Metadata_H
#define Metadata_H
#include "../SparseConvNet.h"
#include "ActivePoolingRules.h"
#include "ConvolutionRules.h"
#include "ValidConvolutionRules.h"
#include <iostream>
#include <tuple>
#include <unordered_map>
// Per-network bookkeeping for one spatial dimensionality: sparse grids and
// lazily-built, cached rulebooks, all keyed by (combinations of) spatial
// size, filter size and stride. All get*RuleBook accessors build on first
// use and return the cached entry thereafter; an empty rulebook is treated
// as "not yet built".
template <uInt dimension> class Metadata {
public:
  // Active-site counts per spatial size.
  std::unordered_map<Point<dimension>, uInt, IntArrayHash<dimension>> nActive;
  // Sparse grids (one per batch sample) per spatial size.
  std::unordered_map<Point<dimension>, SparseGrids<dimension>,
                     IntArrayHash<dimension>>
      grids;
  // Rulebooks for active pooling, keyed by spatial size.
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      activePoolingRuleBooks;
  // Rulebooks for "valid" convolutions, keyed by (spatial size, filter size).
  std::unordered_map<Point<2 * dimension>, RuleBook,
                     IntArrayHash<2 * dimension>>
      validRuleBooks;
  // Rulebooks for strided convolutions, keyed by (size, filter, stride).
  std::unordered_map<Point<3 * dimension>, RuleBook,
                     IntArrayHash<3 * dimension>>
      ruleBooks;
  // Rulebooks for sparse-to-dense conversion, keyed by spatial size.
  std::unordered_map<Point<dimension>, RuleBook, IntArrayHash<dimension>>
      sparseToDenseRuleBooks;
  Point<dimension> inputSpatialSize;
  // Convenience pointers into the maps above for the network's input scale;
  // valid only after setInputSpatialSize has been called.
  SparseGrids<dimension> *inputSGs;
  SparseGrid<dimension> *inputSG; // current sample being populated
  uInt *inputNActive;
  Metadata() {}
  // Record the input spatial size and point the input aliases at the
  // corresponding (default-constructed on first use) map entries.
  void setInputSpatialSize(THLongTensor *spatialSize) {
    inputSpatialSize = LongTensorToPoint<dimension>(spatialSize);
    inputSGs = &grids[inputSpatialSize];
    inputNActive = &nActive[inputSpatialSize];
  }
  SparseGrids<dimension> &getSparseGrid(THLongTensor *spatialSize) {
    return grids[LongTensorToPoint<dimension>(spatialSize)];
  };
  uInt getNActive(THLongTensor *spatialSize) {
    return nActive[LongTensorToPoint<dimension>(spatialSize)];
  };
  // Valid (same-resolution) convolution rulebook; built on first use,
  // optionally with the OpenMP generator.
  RuleBook &getValidRuleBook(THLongTensor *spatialSize, THLongTensor *size,
                             bool openMP) {
    auto p = TwoLongTensorsToPoint<dimension>(spatialSize, size);
    auto &rb = validRuleBooks[p];
    if (rb.empty()) {
      auto &SGs = grids[LongTensorToPoint<dimension>(spatialSize)];
#if defined(ENABLE_OPENMP)
      openMP ? ValidConvolution_SgsToRules_OMP(SGs, rb,
                                               THLongTensor_data(size))
             :
#endif
             ValidConvolution_SgsToRules(SGs, rb, THLongTensor_data(size));
    }
    return rb;
  }
  RuleBook &getActivePoolingRuleBook(THLongTensor *spatialSize) {
    auto spatialSz = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[spatialSz];
    auto &rb = activePoolingRuleBooks[spatialSz];
    if (rb.empty())
      activePoolingRules(SGs, rb);
    return rb;
  }
  RuleBook &getSparseToDenseRuleBook(THLongTensor *spatialSize, bool openMP) {
    auto ss = LongTensorToPoint<dimension>(spatialSize);
    auto &SGs = grids[ss];
    auto &rb = sparseToDenseRuleBooks[ss];
    if (rb.empty())
#if defined(ENABLE_OPENMP)
      openMP ? SparseToDense_InputSgsToRulesAndOutputSgs_OMP(
                   SGs, rb, THLongTensor_data(spatialSize))
             :
#endif
             SparseToDense_InputSgsToRulesAndOutputSgs(
                 SGs, rb, THLongTensor_data(spatialSize));
    return rb;
  }
  // Strided convolution rulebook; also records the output scale's active
  // count in nActive as a side effect of building.
  RuleBook &getRuleBook(THLongTensor *inputSpatialSize,
                        THLongTensor *outputSpatialSize, THLongTensor *size,
                        THLongTensor *stride, bool openMP) {
    auto p = ThreeLongTensorsToPoint<dimension>(inputSpatialSize, size, stride);
    auto &rb = ruleBooks[p];
    if (rb.empty()) {
      auto iS = LongTensorToPoint<dimension>(inputSpatialSize);
      auto oS = LongTensorToPoint<dimension>(outputSpatialSize);
      auto &iSGs = grids[iS];
      auto &oSGs = grids[oS];
      nActive[oS] =
#if defined(ENABLE_OPENMP)
          openMP ? Convolution_InputSgsToRulesAndOutputSgs_OMP(
                       iSGs, oSGs, rb, THLongTensor_data(size),
                       THLongTensor_data(stride),
                       THLongTensor_data(inputSpatialSize),
                       THLongTensor_data(outputSpatialSize))
                 :
#endif
                 Convolution_InputSgsToRulesAndOutputSgs(
                     iSGs, oSGs, rb, THLongTensor_data(size),
                     THLongTensor_data(stride),
                     THLongTensor_data(inputSpatialSize),
                     THLongTensor_data(outputSpatialSize));
    }
    return rb;
  }
};
#endif
PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef RECTANGULARREGIONS_H
#define RECTANGULARREGIONS_H
#include "../SparseConvNet.h"
// For iterating over the rectangular region with corners lb and ub.
// The .end() method and operator!= are designed to allow range based for
// loops of the region, but nothing else.
template
<
uInt
dimension
>
class
RectangularRegionIterator
;
// Axis-aligned box of lattice points with inclusive corners lb and ub,
// iterable in row-major order via RectangularRegionIterator.
template <uInt dimension> class RectangularRegion {
public:
  Point<dimension> lb; // inclusive lower corner
  Point<dimension> ub; // inclusive upper corner
  RectangularRegion(Point<dimension> &lb, Point<dimension> &ub)
      : lb(lb), ub(ub) {}
  RectangularRegionIterator<dimension> begin() {
    return RectangularRegionIterator<dimension>(*this, lb);
  }
  RectangularRegionIterator<dimension> end() {
    // Not really used by the custom operator!= function
    // Otherwise it would need to represent a point just outside the region
    return RectangularRegionIterator<dimension>(*this, ub);
  }
  // Row-major rank of p among the region's points (0 for lb). p is assumed
  // to lie inside the region.
  uInt offset(const Point<dimension> &p) {
    // Enumerate the points inside the region
    uInt of = 0, m = 1;
    for (Int i = dimension - 1; i >= 0; i--) {
      of += m * (p[i] - lb[i]);
      m *= ub[i] - lb[i] + 1;
    }
    return of;
  }
};
// Minimal input-iterator over a RectangularRegion's points in row-major
// order. Termination is signalled via the stillLooping flag, which the
// region's custom operator!= inspects (only range-for is supported).
template <uInt dimension> class RectangularRegionIterator {
private:
  RectangularRegion<dimension> &region;

public:
  bool stillLooping;       // false once iteration has wrapped past ub
  Point<dimension> point;  // current position (copy, advanced in place)
  RectangularRegionIterator(RectangularRegion<dimension> &region,
                            Point<dimension> &point)
      : region(region), point(point), stillLooping(true) {
    // If stride > size, we can have lb[i]>ub[i] meaning region_size == 0
    for (Int i = 0; i < dimension; i++)
      if (point[i] > region.ub[i])
        stillLooping = false;
  }
  // Odometer increment: bump the last coordinate; on overflow reset it to
  // lb and carry into the previous coordinate, ending when the first
  // coordinate overflows.
  RectangularRegionIterator<dimension> &operator++() {
    for (Int i = dimension - 1;;) {
      point[i]++;
      if (point[i] <= region.ub[i])
        break;
      point[i] = region.lb[i];
      i--;
      if (i == -1) {
        stillLooping = false;
        // Signal to operator!= to end iteration
        break;
      }
    }
    return *this;
  }
  Point<dimension> &operator*() { return point; }
};
// Only to be used for checking the end point of range based for loops.
// Deliberately ignores rhs: the loop continues exactly while lhs has not
// yet wrapped past the region's upper corner.
template <uInt dimension>
inline bool operator!=(const RectangularRegionIterator<dimension> &lhs,
                       const RectangularRegionIterator<dimension> &rhs) {
  return lhs.stillLooping;
}
// Odometer-style step of `point` through the box
// [offset[d], offset[d]+size[d]-1] in each dimension d, row-major order:
// bump the last coordinate, carrying upward on overflow. Incrementing the
// final point wraps back to the box's first corner.
template <uInt dimension>
void incrementPointInCube(Point<dimension> &point, long *size, long *offset) {
  Int axis = dimension - 1;
  while (axis >= 0) {
    ++point[axis];
    if (point[axis] < offset[axis] + size[axis])
      return; // no carry needed
    point[axis] = offset[axis]; // wrap this coordinate, carry to the next
    --axis;
  }
}
// For a convolutional layer with given filter *size* and *stride*, compute
// the rectangular subset of the input field feeding one output site: the
// filter footprint anchored at output*stride.
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator(const Point<dimension> &output, long *size,
                      long *stride) {
  Point<dimension> lower, upper;
  for (uInt d = 0; d < dimension; d++) {
    const long anchor = output[d] * stride[d];
    lower[d] = anchor;
    upper[d] = anchor + size[d] - 1;
  }
  return RectangularRegion<dimension>(lower, upper);
}
// For a convolutional layer with the given filter *size* and *stride*:
// the rectangular subset of the output field affected by one input point.
// Inverse of InputRegionCalculator: output o sees input i iff
// o*stride <= i <= o*stride + size - 1, per dimension.
template <uInt dimension>
RectangularRegion<dimension>
OutputRegionCalculator(const Point<dimension> &input, long *size, long *stride,
                       long *outputSpatialSize) {
  Point<dimension> lb, ub;
  for (uInt d = 0; d < dimension; d++) {
    // First output whose window still reaches input[d], clamped at 0.
    lb[d] = std::max(0L, (input[d] - size[d] + stride[d]) / stride[d]);
    // Last output whose window starts at or before input[d], clamped to
    // the spatial extent of the output layer.
    ub[d] = std::min(outputSpatialSize[d] - 1, input[d] / stride[d]);
  }
  return RectangularRegion<dimension>(lb, ub);
}
#endif
/* RECTANGULARREGIONS_H */
PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef TH_GENERIC_FILE
#error "You must define TH_GENERIC_FILE before including THGenerateDimTypes.h"
#endif

// "X-macro" style generator: include the file named by TH_GENERIC_FILE once
// for each supported spatial dimension, with the macro `Dimension` set to
// 1, 2, ..., 10 in turn.  Each `#line 1` directive resets the line number
// reported by the compiler, so diagnostics point into TH_GENERIC_FILE
// itself rather than into this wrapper.
#define Dimension 1
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 2
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 3
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 4
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 5
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 6
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 7
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 8
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 9
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
#define Dimension 10
#line 1 TH_GENERIC_FILE
#include TH_GENERIC_FILE
#undef Dimension
// Consume the macro so the next generator include starts clean.
#undef TH_GENERIC_FILE
PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef VALIDCONVOLUTIONRULES_H
#define VALIDCONVOLUTIONRULES_H
#include<iostream>
// Full input region for an output point of a "valid" (submanifold-style)
// convolution: a size[d]-wide window centred on the output point, shifted
// left by size[d]/2 (integer division), so input and output share the same
// spatial coordinates.
template <uInt dimension>
RectangularRegion<dimension>
InputRegionCalculator_Valid(const Point<dimension> &output, long *size) {
  Point<dimension> lb, ub;
  for (uInt d = 0; d < dimension; d++) {
    Int halfWidth = size[d] / 2;
    lb[d] = output[d] - halfWidth;
    ub[d] = output[d] + size[d] - 1 - halfWidth;
  }
  return RectangularRegion<dimension>(lb, ub);
}
// Call for each convolutional / max-pooling layer, once for each batch item.
// rules is used to carry out the "lowering" whilst carrying out the
// convolution.
//
// For every active output site in *grid*, scan its "valid"-convolution
// receptive field; for each active input found, append the pair
// (input row, output row) — both offset by grid.ctr, the grid's row offset
// within the batch — to the rule list for that filter offset.
//
// Precondition: rules must already hold volume<dimension>(size) entries,
// one per filter offset (the callers below clear/resize it first).
// Returns the number of (input, output) pairs recorded.
template <uInt dimension>
double ValidConvolution_SgToRules(SparseGrid<dimension> &grid, RuleBook &rules,
                                  long *size) {
  // (Removed an unused local that recomputed volume<dimension>(size);
  // the rulebook is already sized by the caller.)
  double countActiveInputs = 0;
  for (auto const &outputIter : grid.mp) {
    auto inRegion =
        InputRegionCalculator_Valid<dimension>(outputIter.first, size);
    // rulesOffset enumerates the filter offsets in the same row-major
    // order that the region iterator visits input points.
    uInt rulesOffset = 0;
    for (auto inputPoint : inRegion) {
      auto inputIter = grid.mp.find(inputPoint);
      if (inputIter != grid.mp.end()) {
        rules[rulesOffset].push_back(inputIter->second + grid.ctr);
        rules[rulesOffset].push_back(outputIter.second + grid.ctr);
        countActiveInputs++;
      }
      rulesOffset++;
    }
  }
  return countActiveInputs;
}
// Build the rulebook for a whole batch, sequentially: one rule list per
// filter offset, with every grid's (input, output) pairs appended in batch
// order.  Returns the total number of pairs recorded.
template <uInt dimension>
uInt ValidConvolution_SgsToRules(SparseGrids<dimension> &SGs, RuleBook &rules,
                                 long *size) {
  uInt filterVolume = volume<dimension>(size);
  rules.clear();
  rules.resize(filterVolume); // one rule list per filter offset
  uInt nActiveInputs = 0;
  for (auto &grid : SGs)
    nActiveInputs += ValidConvolution_SgToRules<dimension>(grid, rules, size);
  return nActiveInputs;
}
// OpenMP variant of ValidConvolution_SgsToRules: builds one private rulebook
// per grid in parallel, then merges them (also in parallel, one filter
// offset per iteration) so the final rulebook matches the sequential
// version's batch ordering.  Returns the total number of (input, output)
// pairs recorded.
template <uInt dimension>
uInt ValidConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
                                     RuleBook &rules, long *size) {
  // Per-grid scratch rulebooks and per-grid pair counts: each OMP thread
  // writes only to its own slots, so no locking is needed.
  std::vector<RuleBook> rbs(SGs.size());
  std::vector<double> countActiveInputs(SGs.size());
  rules.clear();
  uInt sd = volume<dimension>(size); // number of filter offsets
  rules.resize(sd);
  {
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < SGs.size(); i++) {
      rbs[i].resize(sd);
      countActiveInputs[i] =
          ValidConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
    }
  }
  {
    // Merge: rule list i of every per-grid rulebook is concatenated, in
    // grid order, into rules[i].  Parallel over filter offsets, so each
    // thread appends to a distinct destination vector.
    uInt i;
#pragma omp parallel for private(i)
    for (i = 0; i < sd; i++)
      for (auto const &rb : rbs)
        rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end());
  }
  // Reduce the per-grid counts (stored as double by SgToRules) to a uInt.
  uInt countActiveInputs_ = 0;
  for (auto &i : countActiveInputs)
    countActiveInputs_ += i;
  return countActiveInputs_;
}
#endif
/* VALIDCONVOLUTIONRULES_H */
PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
0 → 100644
View file @
f9552033
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef SPARSECONVNET_H
#define SPARSECONVNET_H
// To use 64 bits instead of 32, replace 32bits.h with 64bits.h
#include "32bits.h"
#include <array>
#include <cstdint>
#include <google/dense_hash_map>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
#if defined(ENABLE_OPENMP)
#include <omp.h>
#endif
// Submanifold Sparse Convolutional Networks
// A batch of samples, for each layer of a sparse convolutional network, is
// encoded as a matrix of nActive x nFeatures and a vector of
// hash tables identifying points in space with the rows of
// the matrix.
// SparseGridMap<dimension> - a hash table assigning integer labels (matrix
// row indices) to a sparse collection of 'Point<dimension>' points.
// Uses Google sparsehash's dense_hash_map; a reserved "empty key" is set
// by SparseGrid's constructor below, as the container requires.
template <uInt dimension>
using SparseGridMap =
    google::dense_hash_map<Point<dimension>, int, IntArrayHash<dimension>,
                           std::equal_to<Point<dimension>>>;
// One batch item's active spatial sites: a hash map from points to matrix
// row indices, plus the item's row offset within the batch.
template <uInt dimension> class SparseGrid {
public:
  // Counts active sites while the output hash map is being built; after
  // that, stores this grid's row offset within the batch.
  uInt ctr;
  SparseGridMap<dimension> mp;
  SparseGrid() : ctr(0) {
    // dense_hash_map reserves one key value that must never be inserted.
    // We sacrifice the point (Int_MAX, ..., Int_MAX).
    Point<dimension> sentinel;
    for (uInt d = 0; d < dimension; ++d)
      sentinel[d] = Int_MAX;
    mp.set_empty_key(sentinel);
  }
};
// One SparseGrid per batch item.
template <uInt dimension>
using SparseGrids = std::vector<SparseGrid<dimension>>;
// Each convolution/pooling operation requires the calculation of a 'rulebook'
// setting out how the output points depend on the points in the layer below:
// one vector per filter offset, holding flattened (input row, output row)
// index pairs.
using RuleBook = std::vector<std::vector<uInt>>;
// Code relating to squares/cubes/rectangles/cuboids etc
// Integer powers n^m by template recursion - ok for filter sizes, could
// overflow if we calculate inputSpatialSize^d.
template <uInt m> uInt ipow(uInt n) { return n * ipow<m - 1>(n); }
// Full specializations of a function template are ordinary functions, so
// when defined in a header they must be `inline` to avoid
// multiple-definition (ODR) errors across translation units.
template <> inline uInt ipow<1>(uInt n) { return n; }
template <> inline uInt ipow<0>(uInt n) { return 1; }
// Product of the first `dimension` entries of *point*: the number of grid
// sites in a box with those side lengths (e.g. a filter's spatial volume).
template <uInt dimension> uInt volume(long *point) {
  uInt product = 1;
  for (uInt d = 0; d < dimension; d++)
    product *= point[d];
  return product;
}
// Macro to initialize arguments passed as void*[1] from Lua.
// This allows Lua to take ownership of arbitrary C++ objects.
// The macro:
// - takes a pointer to a pointer [allocated as ffi.new('void *[1]') in Lua]
// - if the pointer has not yet been initialized, create an object for it
// - create a reference "_VAR" to the object
// (The raw new/delete pair is intentional: ownership lives on the Lua side
// of the FFI boundary, so RAII wrappers cannot be used here.)
#define SCN_INITIALIZE_AND_REFERENCE(TYPE, VAR)                               \
  if (VAR[0] == nullptr)                                                      \
    VAR[0] = (void *)new TYPE;                                                \
  TYPE &_##VAR = *(TYPE *)VAR[0];
// Macro to free the memory allocated by SCN_INITIALIZE_AND_REFERENCE.
// Nulls the slot afterwards so a double SCN_DELETE is harmless.
#define SCN_DELETE(TYPE, VAR)                                                 \
  if (VAR[0] != nullptr) {                                                    \
    delete (TYPE *)VAR[0];                                                    \
    VAR[0] = nullptr;                                                         \
  }
// Length of the longest rule list in a rulebook (0 for an empty rulebook).
// `inline` is required: this non-template function is defined in a header,
// so without it every translation unit including the header would emit a
// definition, violating the ODR at link time.
inline uInt ruleBookMaxSize(RuleBook &rb) {
  uInt m = 0;
  for (auto &r : rb)
    m = std::max(m, (uInt)r.size());
  return m;
}
// Total number of entries across all rule lists in a rulebook.
// `inline` is required: this non-template function is defined in a header,
// so without it every translation unit including the header would emit a
// definition, violating the ODR at link time.
inline uInt ruleBookTotalSize(RuleBook &rb) {
  uInt m = 0;
  for (auto &r : rb)
    m += (uInt)r.size();
  return m;
}
#endif
/* SPARSECONVNET_H */
Prev
1
2
3
4
5
6
7
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment