initial commit

f9552033 · Benjamin Thomas Graham · f9552033 · f9552033 · f9552033 · f9552033
Commit f9552033 authored Jul 16, 2017 by Benjamin Thomas Graham
20 changed files
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.cu
+// Copyright 2016-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef TH_GENERIC_FILE_
+#define TH_GENERIC_FILE_ "generic/GPU/Deconvolution.cu"
+#else
+#include "Convolution.h"
+#include "Deconvolution.h"
+#include <algorithm>
+extern "C" double scn_DR_(Deconvolution_updateOutput)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
+    THLongTensor *filterStride, void **m, THCTensor *input_features,
+    THCTensor *output_features, THCTensor *weight, THCTensor *bias,
+    long filterVolume, THCITensor *rulesBuffer) { /*
+   * Forward pass of the sparse deconvolution layer on the GPU.
+   *
+   * Sizes output_features to nActive x weight->size[1], initialises it
+   * (zeros when bias is null, otherwise through Convolution_fp_bias), then
+   * runs dDeconvolution_forward2 through RULEBOOKITERATOR, with w advanced
+   * by ip * op elements in the second macro argument.
+   *
+   * Returns flops, the sum of nHotB * ip * op accumulated by that second
+   * macro argument, as a double.
+   *
+   * NOTE(review): state, real, Dimension, rbB, nHotB and the macros
+   * SCN_INITIALIZE_AND_REFERENCE and RULEBOOKITERATOR are defined outside
+   * this section. How often each macro argument runs is inferred from the
+   * way they are written here - confirm against RuleBookIterator.h.
+   * filterVolume and rulesBuffer are not named in this body and are
+   * presumably consumed by RULEBOOKITERATOR.
+   */
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  auto _rules =
+      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true); // note the order: outputSize first, inputSize second
+  uInt nActive = _m.getNActive(outputSize); // row count used for output_features
+  THCTensor_(resize2d)(state, output_features, nActive, weight->size[1]);
+  if (not bias) // no bias: the kernels accumulate on top of zeros
+    THCTensor_(zero)(state, output_features);
+  auto iF = THCTensor_(data)(state, input_features);
+  auto oF = THCTensor_(data)(state, output_features);
+  auto ip = input_features->size[1]; // input planes (columns of input_features)
+  auto op = output_features->size[1]; // output planes (columns of output_features)
+  auto w = THCTensor_(data)(state, weight);
+  double flops = 0;
+  if (bias) {
+    auto b = THCTensor_(data)(state, bias);
+    for (uInt i = 0; i < op; i += 32) { // one launch per chunk of at most 32 output columns
+      uInt blockDim = min(32L, op - i); // threads per block = columns in this chunk
+      uInt gridDim = min(4096, nActive); // NOTE(review): this is 0 when nActive is 0, which is not a valid grid size
+      Convolution_fp_bias
+              << <gridDim, blockDim, 0, THCState_getCurrentStream(state)>>>
+          (oF + i, b + i, op, op, nActive);
+    }
+  }
+  uInt c = ip * op; // element count of one ip x op weight matrix
+  RULEBOOKITERATOR(
+      dDeconvolution_forward2<real>(iF, oF, w, rbB, nHotB, ip, ip, op, op,
+                                    THCState_getCurrentStream(state));
+      , w += c; flops += nHotB * c;)
+  return flops;
+}
+extern "C" void scn_DR_(Deconvolution_backward)(
+    THLongTensor *inputSize, THLongTensor *outputSize, THLongTensor *filterSize,
+    THLongTensor *filterStride, void **m, THCTensor *input_features,
+    THCTensor *d_input_features, THCTensor *d_output_features,
+    THCTensor *weight, THCTensor *d_weight, THCTensor *d_bias,
+    long filterVolume, THCITensor *rulesBuffer) { /*
+   * Backward pass of the sparse deconvolution layer on the GPU.
+   *
+   * Resizes d_input_features to match input_features and zeroes it, then
+   * runs dDeconvolution_backward_dW2 through RULEBOOKITERATOR with the
+   * weight and weight-gradient pointers advanced together by ip * op
+   * elements in the second macro argument. When d_bias is non-null,
+   * Convolution_bp_bias is then called on the d_output_features data.
+   *
+   * d_weight is not zeroed here and the kernels add into it with
+   * atomicAdd - presumably the caller clears it or accumulates on purpose
+   * (TODO confirm).
+   *
+   * NOTE(review): state, real, Dimension, rbB, nHotB and the macros
+   * SCN_INITIALIZE_AND_REFERENCE and RULEBOOKITERATOR are defined outside
+   * this section; confirm the iteration semantics in RuleBookIterator.h.
+   * filterVolume and rulesBuffer are not named in this body and are
+   * presumably consumed by RULEBOOKITERATOR.
+   */
+  SCN_INITIALIZE_AND_REFERENCE(Metadata<Dimension>, m)
+  auto _rules =
+      _m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true); // same argument order as the forward pass
+  uInt nActive = _m.getNActive(outputSize); // only used for the bias gradient below
+  THCTensor_(resizeAs)(state, d_input_features, input_features);
+  THCTensor_(zero)(state, d_input_features); // the kernels add into d_input_features
+  auto iF = THCTensor_(data)(state, input_features);
+  auto diF = THCTensor_(data)(state, d_input_features);
+  auto doF = THCTensor_(data)(state, d_output_features);
+  auto ip = input_features->size[1]; // input planes
+  auto op = d_output_features->size[1]; // output planes
+  auto w = THCTensor_(data)(state, weight);
+  auto dw = THCTensor_(data)(state, d_weight);
+  uInt c = ip * op; // element count of one ip x op weight matrix
+  RULEBOOKITERATOR(dDeconvolution_backward_dW2<real>(
+                       iF, diF, doF, w, dw, rbB, nHotB, ip, ip, op, op,
+                       THCState_getCurrentStream(state));
+                   , w += c; dw += c;)
+  if (d_bias) {
+    auto db = THCTensor_(data)(state, d_bias);
+    Convolution_bp_bias(doF, db, op, op, nActive,
+                        THCState_getCurrentStream(state));
+  }
+}
+#endif
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/Deconvolution.h
+// Copyright 2016-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef GPU_DECONVOLUTION_H
+#define GPU_DECONVOLUTION_H
+#include "../SparseConvNet.h"
+#include "Convolution.h"
+template <typename T, uInt K, uInt V>
+__global__ void
+dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
+                              uInt nHot, uInt input_nPlanes, uInt input_stride,
+                              uInt output_nPlanes, uInt output_stride) {
+  /* Full-tile forward kernel: for every rule pair, multiplies one input row
+   * by the weight matrix and adds the product into the paired output row.
+   *
+   * This variant has no bounds checks, so:
+   *   - nHot must be a multiple of K!!
+   *   - input_nPlanes and output_nPlanes must be multiples of K (M below
+   *     is a truncating division and every tile is read in full).
+   * The FOO launcher macro in this header sends leftover rules to the
+   * bounds-checked forwardB variant.
+   *
+   * rules holds nHot index pairs: the even element of a pair is used as the
+   * output row, the odd element as the input row.
+   */
+  // Input x Weight -> Output
+  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
+  // K is a multiple of V, so each thread handles V rows of a K x K tile
+  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
+  uInt M = input_nPlanes / K; // number of K-wide input-plane chunks
+  // N = gridDim.y == output_nPlanes/K
+  uInt n = blockIdx.y; // K-wide output-column tile owned by this block
+  outFeatures += n * K;
+  w += n * K;
+  T O[V]; // per-thread accumulators, one per handled tile row
+  __shared__ T W[K][K]; // weight tile for chunk m and column tile n
+  __shared__ T I[K][K]; // input tile: one row per rule, one column per input plane of chunk m
+  uInt R0[V]; // input row index of each rule handled by this thread
+  uInt R1[V]; // output row index of each rule handled by this thread
+  const int tx = threadIdx.x;
+  int ty[V];
+#pragma unroll
+  for (int v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V); // tile rows handled by this thread
+  for (int m = 0; m < M; m++) {
+// Read w: load the K x K weight tile for input-plane chunk m
+#pragma unroll
+    for (int v = 0; v < V; v++)
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { // blocks stride over the rules, K at a time
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        R1[v] = rules[2 * (s + ty[v])];
+        R0[v] = rules[2 * (s + ty[v]) + 1];
+      }
+      __syncthreads();
+// Read input, reset O[]
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+        O[v] = 0;
+      }
+      __syncthreads(); // W and I are fully written before any thread reads them
+#pragma unroll
+      for (int k = 0; k < K; k++)
+#pragma unroll
+        for (int v = 0; v < V; v++)
+          O[v] += I[ty[v]][k] * W[k][tx];
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        O[v] += outFeatures[R1[v] * output_stride + tx]; // non-atomic read-modify-write; presumably an output row occurs at most once per rules array - TODO confirm
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        outFeatures[R1[v] * output_stride + tx] = O[v];
+      __syncthreads(); // all threads are done reading W and I before either is overwritten
+    }
+    w += K * output_nPlanes; // next K rows of the weight matrix
+    inFeatures += K; // next K input planes
+  }
+}
+template <typename T, uInt K, uInt V>
+__global__ void
+dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
+                              uInt nHot, uInt input_nPlanes, uInt input_stride,
+                              uInt output_nPlanes, uInt output_stride) {
+  /* Bounds-checked forward kernel for a rule count that need not be a
+   * multiple of K. Same computation as the forwardA variant, but every
+   * access that depends on a rule is guarded by s + ty[v] < nHot.
+   *
+   * input_nPlanes and output_nPlanes must still be multiples of K: M is a
+   * truncating division and the plane dimension is not bounds-checked.
+   *
+   * Rows of I that belong to rules past nHot are not written and so hold
+   * stale values; the products computed from them are never stored.
+   *
+   * Input x Weight -> Output
+   */
+  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
+  // K is a multiple of V, so each thread handles V rows of a K x K tile
+  // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
+  uInt M = input_nPlanes / K; // number of K-wide input-plane chunks
+  // N = gridDim.y == output_nPlanes/K
+  uInt n = blockIdx.y; // K-wide output-column tile owned by this block
+  outFeatures += n * K;
+  w += n * K;
+  T O[V]; // per-thread accumulators, one per handled tile row
+  __shared__ T W[K][K]; // weight tile for chunk m and column tile n
+  __shared__ T I[K][K]; // input tile: one row per rule, one column per input plane of chunk m
+  uInt R0[V]; // input row indices (only set for in-range rules)
+  uInt R1[V]; // output row indices (only set for in-range rules)
+  const int tx = threadIdx.x;
+  int ty[V];
+#pragma unroll
+  for (int v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V); // tile rows handled by this thread
+  for (int m = 0; m < M; m++) {
+// Read w: load the K x K weight tile for input-plane chunk m
+#pragma unroll
+    for (int v = 0; v < V; v++)
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { // blocks stride over the rules, K at a time
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        if (s + ty[v] < nHot) { // skip rule slots past the end of the list
+          R1[v] = rules[2 * (s + ty[v])];
+          R0[v] = rules[2 * (s + ty[v]) + 1];
+        }
+      }
+      __syncthreads();
+// Read input, reset O[]
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        if (s + ty[v] < nHot)
+          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+        O[v] = 0;
+      }
+      __syncthreads(); // W and I are fully written before any thread reads them
+#pragma unroll
+      for (int k = 0; k < K; k++)
+#pragma unroll
+        for (int v = 0; v < V; v++)
+          O[v] += I[ty[v]][k] * W[k][tx];
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          O[v] += outFeatures[R1[v] * output_stride + tx]; // non-atomic read-modify-write of the output row
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          outFeatures[R1[v] * output_stride + tx] = O[v];
+      __syncthreads(); // all threads are done reading W and I before either is overwritten
+    }
+    w += K * output_nPlanes; // next K rows of the weight matrix
+    inFeatures += K; // next K input planes
+  }
+}
+#define FOO(K, V)                                                              \
+  {                                                                            \
+    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
+      uInt o = (nHot / K) * K;                                                 \
+      if (o >= K)                                                              \
+        dDeconvolution_KMxKN_forwardA<T, K, V> << <                            \
+            dim3(std::min(o / K, (uInt)512), output_nPlanes / K),              \
+            dim3(K, K / V), 0, stream>>>                                       \
+            (inFeatures, outFeatures, w, rules, o, input_nPlanes,              \
+             input_stride, output_nPlanes, output_stride);                     \
+      if (nHot > o)                                                            \
+        dDeconvolution_KMxKN_forwardB<T, K, V> << <                            \
+            dim3(1, output_nPlanes / K), dim3(K, K / V), 0, stream>>>          \
+            (inFeatures, outFeatures, w, rules + 2 * o, nHot - o,              \
+             input_nPlanes, input_stride, output_nPlanes, output_stride);      \
+      return;                                                                  \
+    }                                                                          \
+  }
+template <typename T>
+void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
+                            uInt nHot, uInt input_nPlanes, uInt input_stride,
+                            uInt output_nPlanes, uInt output_stride,
+                            cudaStream_t stream) { /*
+   * Host-side launcher for the tiled forward kernels.
+   *
+   * Tries tile sizes K = 64, 32, 16, 8 in that order and uses the first K
+   * that divides both input_nPlanes and output_nPlanes; the FOO expansion
+   * for that K returns from this function.
+   *
+   * Inside FOO, o is nHot rounded down to a multiple of K. The first o
+   * rules go to the unchecked A kernel (grid x = min(o / K, 512)) and the
+   * remaining nHot - o rules go to the bounds-checked B kernel with one
+   * block in x, reading from rules + 2 * o. With nHot == 0 neither kernel
+   * is launched.
+   *
+   * Launches are issued on stream; no error check follows them here.
+   */
+  FOO(64, 16)
+  FOO(32, 8)
+  FOO(16, 4)
+  FOO(8, 2)
+  assert(false); // reached only when a plane count is not a multiple of 8; with NDEBUG defined the function then returns without launching anything
+}
+#undef FOO
+/* Full-tile backward kernel. For every rule pair it adds dOutput x W^T
+ * into the paired dInFeatures row, and it accumulates Input^T x dOutput
+ * into dw with atomicAdd.
+ *
+ * This variant has no bounds checks: nHot, input_nPlanes and
+ * output_nPlanes must all be multiples of K. The FOO launcher macro in
+ * this header sends leftover rules to the bounds-checked _B variant.
+ *
+ * rules holds nHot index pairs: the even element of a pair indexes the
+ * dOutFeatures row, the odd element indexes the inFeatures and dInFeatures
+ * row.
+ */
+// dOutput x W^T -> dInput and Input^T x dOutput -> dW
+// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
+template <typename T, uInt K, uInt V>
+__global__ void dDeconvolution_KMxKN_backward_dW_A(
+    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
+    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
+    uInt output_stride) {
+  // M = gridDim.y == input_nPlanes / K
+  uInt N = output_nPlanes / K; // number of K-wide output-plane chunks
+  uInt m = blockIdx.y; // K-wide input-plane tile owned by this block
+  inFeatures += m * K;
+  dInFeatures += m * K;
+  w += m * K * output_nPlanes;
+  dw += m * K * output_nPlanes;
+  T dI[V]; // per-thread partial input gradients, one per handled rule row
+  T dW[V]; // per-thread partial weight gradients, summed over all rules of this block
+  __shared__ T I[K][K]; // input tile: one row per rule
+  __shared__ T dO[K][K]; // output-gradient tile: one row per rule
+  __shared__ T W[K][K]; // weight tile for input tile m and output chunk n
+  uInt R0[V]; // input row index of each rule handled by this thread
+  uInt R1[V]; // output row index of each rule handled by this thread
+  const int tx = threadIdx.x;
+  int ty[V];
+#pragma unroll
+  for (int v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V); // tile rows handled by this thread
+  for (int n = 0; n < N; n++) {
+// Read w, reset dW
+#pragma unroll
+    for (int v = 0; v < V; v++) {
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+      dW[v] = 0;
+    }
+    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { // blocks stride over the rules, K at a time
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        R1[v] = rules[2 * (s + ty[v])];
+        R0[v] = rules[2 * (s + ty[v]) + 1];
+        dI[v] = 0;
+      }
+      __syncthreads();
+// Read input and dOutput
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+      }
+      __syncthreads(); // W, I and dO are fully written before any thread reads them
+#pragma unroll
+      for (int k = 0; k < K; k++)
+#pragma unroll
+        for (int v = 0; v < V; v++) {
+          dI[v] += dO[ty[v]][k] * W[tx][k]; // W is indexed [tx][k], i.e. read transposed
+          dW[v] += I[k][ty[v]] * dO[k][tx]; // I is indexed [k][ty], i.e. read transposed
+        }
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        dI[v] += dInFeatures[R0[v] * input_stride + tx]; // non-atomic read-modify-write of the input-gradient row
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        dInFeatures[R0[v] * input_stride + tx] = dI[v];
+      __syncthreads(); // all threads are done with the shared tiles before they are overwritten
+    }
+#pragma unroll
+    for (int v = 0; v < V; v++)
+      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); // blocks that differ only in blockIdx.x add into the same dw tile
+    w += K; // next K output columns
+    dw += K;
+    dOutFeatures += K;
+  }
+}
+/* Bounds-checked backward kernel for a rule count that need not be a
+ * multiple of K. Same computation as the _A variant: adds dOutput x W^T
+ * into dInFeatures and accumulates Input^T x dOutput into dw with
+ * atomicAdd.
+ *
+ * Rule slots with s + ty[v] >= nHot get zero rows in the shared I and dO
+ * tiles, so they add nothing to dW, and their dInFeatures update is
+ * skipped.
+ *
+ * input_nPlanes and output_nPlanes must still be multiples of K; the plane
+ * dimension is not bounds-checked.
+ */
+// dOutput x W^T -> dInput and Input^T x dOutput -> dW
+// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
+template <typename T, uInt K, uInt V>
+__global__ void dDeconvolution_KMxKN_backward_dW_B(
+    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
+    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
+    uInt output_stride) {
+  // M = gridDim.y == input_nPlanes / K
+  uInt N = output_nPlanes / K; // number of K-wide output-plane chunks
+  uInt m = blockIdx.y; // K-wide input-plane tile owned by this block
+  inFeatures += m * K;
+  dInFeatures += m * K;
+  w += m * K * output_nPlanes;
+  dw += m * K * output_nPlanes;
+  T dI[V]; // per-thread partial input gradients, one per handled rule row
+  T dW[V]; // per-thread partial weight gradients, summed over all rules of this block
+  __shared__ T I[K][K]; // input tile: one row per rule, zero for out-of-range rules
+  __shared__ T dO[K][K]; // output-gradient tile: one row per rule, zero for out-of-range rules
+  __shared__ T W[K][K]; // weight tile for input tile m and output chunk n
+  uInt R0[V]; // input row indices (only set for in-range rules)
+  uInt R1[V]; // output row indices (only set for in-range rules)
+  const int tx = threadIdx.x;
+  int ty[V];
+#pragma unroll
+  for (int v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V); // tile rows handled by this thread
+  for (int n = 0; n < N; n++) {
+// Read w, reset dW
+#pragma unroll
+    for (int v = 0; v < V; v++) {
+      W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+      dW[v] = 0;
+    }
+    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { // blocks stride over the rules, K at a time
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        if (s + ty[v] < nHot) { // skip rule slots past the end of the list
+          R1[v] = rules[2 * (s + ty[v])];
+          R0[v] = rules[2 * (s + ty[v]) + 1];
+        }
+        dI[v] = 0;
+      }
+      __syncthreads();
+// Read input and dOutput
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        if (s + ty[v] < nHot) {
+          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+        } else {
+          I[ty[v]][tx] = 0; // zero rows keep the full-K dW sum below correct
+          dO[ty[v]][tx] = 0;
+        }
+      __syncthreads(); // W, I and dO are fully written before any thread reads them
+#pragma unroll
+      for (int k = 0; k < K; k++)
+#pragma unroll
+        for (int v = 0; v < V; v++) {
+          dI[v] += dO[ty[v]][k] * W[tx][k]; // W is indexed [tx][k], i.e. read transposed
+          dW[v] += I[k][ty[v]] * dO[k][tx]; // I is indexed [k][ty], i.e. read transposed
+        }
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          dI[v] += dInFeatures[R0[v] * input_stride + tx]; // non-atomic read-modify-write of the input-gradient row
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        if (s + ty[v] < nHot)
+          dInFeatures[R0[v] * input_stride + tx] = dI[v];
+      __syncthreads(); // all threads are done with the shared tiles before they are overwritten
+    }
+#pragma unroll
+    for (int v = 0; v < V; v++)
+      atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); // dw may also be updated by other blocks and by the _A kernel
+    w += K; // next K output columns
+    dw += K;
+    dOutFeatures += K;
+  }
+}
+#define FOO(K, V)                                                              \
+  {                                                                            \
+    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
+      uInt o = (nHot / K) * K;                                                 \
+      if (o >= K)                                                              \
+        dDeconvolution_KMxKN_backward_dW_A<T, K, V> << <                       \
+            dim3(std::min(o / K, (uInt)512), input_nPlanes / K),               \
+            dim3(K, K / V), 0, stream>>>                                       \
+            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,           \
+             input_nPlanes, input_stride, output_nPlanes, output_stride);      \
+      if (nHot > o)                                                            \
+        dDeconvolution_KMxKN_backward_dW_B<T, K, V> << <                       \
+            dim3(1, input_nPlanes / K), dim3(K, K / V), 0, stream>>>           \
+            (inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,      \
+             nHot - o, input_nPlanes, input_stride, output_nPlanes,            \
+             output_stride);                                                   \
+      return;                                                                  \
+    }                                                                          \
+  }
+template <typename T>
+void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
+                                T *w, T *dw, uInt *rules, uInt nHot,
+                                uInt input_nPlanes, uInt input_stride,
+                                uInt output_nPlanes, uInt output_stride,
+                                cudaStream_t stream) { /*
+   * Host-side launcher for the tiled backward kernels.
+   *
+   * Tries tile sizes K = 32, 16, 8 in that order and uses the first K that
+   * divides both input_nPlanes and output_nPlanes; the FOO expansion for
+   * that K returns from this function. Unlike the forward launcher there
+   * is no K = 64 case.
+   *
+   * Inside FOO, o is nHot rounded down to a multiple of K. The first o
+   * rules go to the unchecked _A kernel (grid x = min(o / K, 512), grid y
+   * = input_nPlanes / K) and the remaining nHot - o rules go to the
+   * bounds-checked _B kernel with one block in x, reading from
+   * rules + 2 * o. With nHot == 0 neither kernel is launched.
+   *
+   * Launches are issued on stream; no error check follows them here.
+   */
+  FOO(32, 8)
+  FOO(16, 4)
+  FOO(8, 2)
+  assert(false); // reached only when a plane count is not a multiple of 8; with NDEBUG defined the function then returns without launching anything
+}
+#undef FOO
+template <typename T, uInt K, uInt V>
+__global__ void
+dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
+                              uInt nHot, uInt input_nPlanes, uInt input_stride,
+                              uInt output_nPlanes, uInt output_stride) {
+  /* General forward kernel: works for any nHot and for plane counts that
+   * are not multiples of K. For every rule pair it multiplies one input
+   * row by the weight matrix and adds the product into the paired output
+   * row.
+   *
+   * Partial tiles are handled with KI (valid input planes in chunk m) and
+   * KO (valid output columns in tile n). Shared-memory entries outside
+   * those bounds are not written and hold stale values; results derived
+   * from them are never stored.
+   *
+   * rules holds nHot index pairs: the even element of a pair is used as the
+   * output row, the odd element as the input row.
+   *
+   * Input x Weight -> Output
+   */
+  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
+  // K is a multiple of V, so each thread handles V rows of a K x K tile
+  // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
+  // - parallel over N,nHot - loop over M
+  uInt M = (input_nPlanes + K - 1) / K; // ceiling division: the last chunk may be partial
+  // N = gridDim.y ~ output_nPlanes/K
+  uInt n = blockIdx.y; // K-wide output-column tile owned by this block
+  outFeatures += n * K;
+  w += n * K;
+  uInt KO = min(K, output_nPlanes - K * n); // valid output columns in this tile
+  T O[V]; // per-thread accumulators, one per handled tile row
+  __shared__ T W[K][K]; // weight tile; only the KI x KO corner is loaded
+  __shared__ T I[K][K]; // input tile: one row per rule, KI valid columns
+  __shared__ uInt R[K * 2]; // rule pairs for the K rules of the current step
+  const int tx = threadIdx.x;
+  int ty[V];
+#pragma unroll
+  for (int v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V); // tile rows handled by this thread
+  for (int m = 0; m < M; m++) {
+    uInt KI = min(K, input_nPlanes - K * m); // valid input planes in chunk m
+// Read w: load the valid corner of the weight tile for chunk m
+#pragma unroll
+    for (int v = 0; v < V; v++)
+      if (ty[v] < KI and tx < KO)
+        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { // blocks stride over the rules, K at a time
+// Read rules for K input/output pairs
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        if (ty[v] < 2) { // tile rows 0 and 1 together cover the 2 * K entries of R
+          int q = ty[v] * K + tx;
+          if (s + q / 2 < nHot) // q / 2 is the rule that entry q belongs to
+            R[q] = rules[2 * s + q];
+        }
+      }
+      __syncthreads(); // R is fully loaded before it is used for indexing
+// Read input, reset O[]
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        if (tx < KI and s + ty[v] < nHot)
+          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
+        O[v] = 0;
+      }
+      __syncthreads(); // W and I are fully written before the multiply
+#pragma unroll
+      for (int k = 0; k < KI; k++)
+#pragma unroll
+        for (int v = 0; v < V; v++)
+          O[v] += I[ty[v]][k] * W[k][tx];
+      __syncthreads();
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        if (tx < KO and s + ty[v] < nHot)
+          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v]; // non-atomic read-modify-write of the output row
+      __syncthreads(); // all uses of R are finished before the next step reloads it
+    }
+    w += K * output_nPlanes; // next K rows of the weight matrix
+    inFeatures += K; // next K input planes
+  }
+}
+template <typename T>
+void dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
+                             uInt nHot, uInt input_nPlanes, uInt input_stride,
+                             uInt output_nPlanes, uInt output_stride,
+                             cudaStream_t stream) {
+  // Forward-pass dispatcher.
+  //  - Both plane counts are multiples of 8: hand over to
+  //    dDeconvolution_forward, which selects a tiled kernel.
+  //  - Otherwise: launch the general kernel with K = 16, V = 4 on a grid of
+  //    128 x ceil(output_nPlanes / K) blocks of K x (K / V) threads.
+  // All work is issued on `stream`; nothing is synchronised or checked here.
+  const bool planesAligned =
+      (input_nPlanes % 8 == 0) and (output_nPlanes % 8 == 0);
+  if (planesAligned) {
+    dDeconvolution_forward(inFeatures, outFeatures, w, rules, nHot,
+                           input_nPlanes, input_stride, output_nPlanes,
+                           output_stride, stream);
+    return;
+  }
+  const int K = 16;
+  const int V = 4;
+  const dim3 grid(128, (output_nPlanes + K - 1) / K);
+  const dim3 block(K, K / V);
+  dDeconvolution_KMxKN_forward2<T, K, V><<<grid, block, 0, stream>>>(
+      inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
+      output_nPlanes, output_stride);
+}
+/* General backward kernel: works for any nHot and for plane counts that
+ * are not multiples of K. For every rule pair it adds dOutput x W^T into
+ * the paired dInFeatures row, and it accumulates Input^T x dOutput into dw
+ * with atomicAdd.
+ *
+ * Partial tiles are handled with KI (valid input planes in tile m) and KO
+ * (valid output columns in chunk n). The shared I and dO tiles are
+ * zero-filled outside the valid region, which is what lets the dW sum run
+ * over the full K rule slots.
+ *
+ * rules holds nHot index pairs: the even element of a pair indexes the
+ * dOutFeatures row, the odd element indexes the inFeatures and dInFeatures
+ * row.
+ */
+// dOutput x W^T -> dInput and Input^T x dOutput -> dW
+// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
+template <typename T, uInt K, uInt V>
+__global__ void dDeconvolution_KMxKN_backward_dW2(
+    T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, uInt *rules,
+    uInt nHot, uInt input_nPlanes, uInt input_stride, uInt output_nPlanes,
+    uInt output_stride) {
+  // M = gridDim.y == ceil(input_nPlanes / K), as set up by the host launcher
+  uInt N = (output_nPlanes + K - 1) / K; // ceiling division: the last chunk may be partial
+  uInt m = blockIdx.y; // K-wide input-plane tile owned by this block
+  inFeatures += m * K;
+  dInFeatures += m * K;
+  w += m * K * output_nPlanes;
+  dw += m * K * output_nPlanes;
+  uInt KI = min(K, input_nPlanes - K * m); // valid input planes in this tile
+  T dI[V]; // per-thread partial input gradients, one per handled rule row
+  T dW[V]; // per-thread partial weight gradients, summed over all rules of this block
+  __shared__ T I[K][K]; // input tile, zero outside the valid region
+  __shared__ T dO[K][K]; // output-gradient tile, zero outside the valid region
+  __shared__ T W[K][K]; // weight tile; only the KI x KO corner is loaded
+  __shared__ uInt R[K * 2]; // rule pairs for the K rules of the current step
+  const int tx = threadIdx.x;
+  int ty[V];
+#pragma unroll
+  for (int v = 0; v < V; v++)
+    ty[v] = threadIdx.y + v * (K / V); // tile rows handled by this thread
+  for (int n = 0; n < N; n++) {
+    uInt KO = min(K, output_nPlanes - K * n); // valid output columns in chunk n
+// Read w, reset dW
+#pragma unroll
+    for (int v = 0; v < V; v++) {
+      if (ty[v] < KI and tx < KO)
+        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+      dW[v] = 0;
+    }
+    for (uInt s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { // blocks stride over the rules, K at a time
+// Read rules for K input/output pairs, reset dI[]
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        if (ty[v] < 2) { // tile rows 0 and 1 together cover the 2 * K entries of R
+          int q = ty[v] * K + tx;
+          if (s + q / 2 < nHot) // q / 2 is the rule that entry q belongs to
+            R[q] = rules[2 * s + q];
+        }
+        dI[v] = 0;
+      }
+      __syncthreads(); // R is fully loaded before it is used for indexing
+// Read input and dOutput
+#pragma unroll
+      for (int v = 0; v < V; v++) {
+        if (tx < KI and s + ty[v] < nHot)
+          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
+        else
+          I[ty[v]][tx] = 0;
+        if (tx < KO and s + ty[v] < nHot)
+          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
+        else
+          dO[ty[v]][tx] = 0;
+      }
+      __syncthreads(); // W, I and dO are fully written before any thread reads them
+#pragma unroll
+      for (int k = 0; k < KO; k++)
+#pragma unroll
+        for (int v = 0; v < V; v++)
+          dI[v] += dO[ty[v]][k] * W[tx][k]; // W is indexed [tx][k], i.e. read transposed
+#pragma unroll
+      for (int k = 0; k < K; k++) // full K is safe here because I and dO are zero-filled
+#pragma unroll
+        for (int v = 0; v < V; v++)
+          dW[v] += I[k][ty[v]] * dO[k][tx];
+      __syncthreads();
+#pragma unroll
+      for (int v = 0; v < V; v++)
+        if (tx < KI and s + ty[v] < nHot)
+          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v]; // non-atomic read-modify-write of the input-gradient row
+      __syncthreads(); // all uses of R are finished before the next step reloads it
+    }
+#pragma unroll
+    for (int v = 0; v < V; v++)
+      if (ty[v] < KI and tx < KO)
+        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); // blocks that differ only in blockIdx.x add into the same dw tile
+    w += K; // next K output columns
+    dw += K;
+    dOutFeatures += K;
+  }
+}
+template <typename T>
+void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
+                                 T *w, T *dw, uInt *rules, uInt nHot,
+                                 uInt input_nPlanes, uInt input_stride,
+                                 uInt output_nPlanes, uInt output_stride,
+                                 cudaStream_t stream) {
+  // Backward-pass dispatcher.
+  //  - Both plane counts are multiples of 8: hand over to
+  //    dDeconvolution_backward_dW, which selects a tiled kernel.
+  //  - Otherwise: launch the general kernel with K = 16, V = 4 on a grid of
+  //    128 x ceil(input_nPlanes / K) blocks of K x (K / V) threads.
+  // All work is issued on `stream`; nothing is synchronised or checked here.
+  const bool planesAligned =
+      (input_nPlanes % 8 == 0) and (output_nPlanes % 8 == 0);
+  if (planesAligned) {
+    dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
+                               rules, nHot, input_nPlanes, input_stride,
+                               output_nPlanes, output_stride, stream);
+    return;
+  }
+  const int K = 16;
+  const int V = 4;
+  const dim3 grid(128, (input_nPlanes + K - 1) / K);
+  const dim3 block(K, K / V);
+  dDeconvolution_KMxKN_backward_dW2<T, K, V><<<grid, block, 0, stream>>>(
+      inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot, input_nPlanes,
+      input_stride, output_nPlanes, output_stride);
+}
+#endif /* GPU_DECONVOLUTION_H */
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.cu
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/LeakyReLU.h
+// Copyright 2016-present, Facebook, Inc.
+// All rights reserved.
+//
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+#ifndef LEAKYRELU_H
+#define LEAKYRELU_H
+// Leaky ReLU forward pass over a flat array of n elements:
+//   output_features[i] = x       if x > 0
+//                      = x * alpha otherwise,   where x = input_features[i].
+//
+// Launch layout: 1-D grid of 1-D blocks; any grid/block size is valid.
+// The loop stride is taken from the actual launch configuration
+// (blockDim.x * gridDim.x). It was previously hard-coded to 16 * 1024, which
+// only visits every element exactly once when exactly 16384 threads are
+// launched; for that launch size the behaviour is unchanged.
+// Each input element is read once before the write, so the kernel also works
+// when input_features and output_features are the same buffer.
+template <typename T>
+__global__ void LeakyReLU_fp(T *input_features, T *output_features, uInt n,
+                             T alpha) {
+  const uInt stride = blockDim.x * gridDim.x;
+  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
+    const T x = input_features[i];
+    output_features[i] = (x > 0) ? x : (x * alpha);
+  }
+}
+// Leaky ReLU backward pass over a flat array of n elements:
+//   d_input_features[i] = g         if input_features[i] > 0
+//                       = g * alpha otherwise,  where g = d_output_features[i].
+//
+// Launch layout: 1-D grid of 1-D blocks; any grid/block size is valid.
+// The loop stride is taken from the actual launch configuration
+// (blockDim.x * gridDim.x). It was previously hard-coded to 16 * 1024, which
+// only visits every element exactly once when exactly 16384 threads are
+// launched; for that launch size the behaviour is unchanged.
+// The gradient element is read once before the write, so d_input_features may
+// be the same buffer as d_output_features.
+template <typename T>
+__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
+                             T *d_output_features, uInt n, T alpha) {
+  const uInt stride = blockDim.x * gridDim.x;
+  for (uInt i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
+    const T g = d_output_features[i];
+    d_input_features[i] = (input_features[i] > 0) ? g : (g * alpha);
+  }
+}
+#endif
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.cu
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/MaxPooling.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/NetworkInNetwork.cu
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/RuleBookIterator.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.cu
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/SparseToDense.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateCudaFloatTypes.h
--- a/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
+++ b/PyTorch/sparseconvnet/SCN/generic/GPU/THGenerateDimCudaFloatTypes.h
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/ActivePoolingRules.h
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/ConvolutionRules.h
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.cpp
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/Metadata.h
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/RectangularRegions.h
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/THGenerateDimTypes.h
--- a/PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
+++ b/PyTorch/sparseconvnet/SCN/generic/Geometry/ValidConvolutionRules.h
--- a/PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h
+++ b/PyTorch/sparseconvnet/SCN/generic/SparseConvNet.h