fixes

66986767 · Benjamin Thomas Graham · edf89af3 · 66986767 · 66986767 · 66986767
Commit 66986767 authored Jul 31, 2018 by Benjamin Thomas Graham
8 changed files
--- a/sparseconvnet/SCN/CPU/IOLayers.cpp
+++ b/sparseconvnet/SCN/CPU/IOLayers.cpp
@@ -17,14 +17,14 @@ void InputLayer_ForwardPass(T *input_features, T *output_features, Int nRows,
  for (row = 0; row < nRows; row++) {
-    auto nActive = rules[0];
+    auto r = rules + row * (1 + maxActive); // this row's rule record
+    auto nActive = r[0]; // per-row count; rules[0] is always row 0's
    T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
+    auto out_f = output_features + row * nPlanes;
    for (Int i = 1; i <= nActive; ++i) {
-      auto in_f = input_features + nPlanes * rules[i];
+      auto in_f = input_features + r[i] * nPlanes;
      for (Int plane = 0; plane < nPlanes; plane++) {
-        output_features[plane] += multiplier * in_f[plane];
+        out_f[plane] += multiplier * in_f[plane];
      }
    }
-    output_features += nPlanes;
-    rules += 1 + maxActive;
  }
 }
 template <typename T>
@@ -36,13 +36,13 @@ void InputLayer_BackwardPass(T *d_input_features, T *d_output_features,
  for (row = 0; row < nRows; row++) {
-    auto nActive = rules[0];
+    auto r = rules + row * (1 + maxActive); // this row's rule record
+    auto nActive = r[0]; // per-row count; rules[0] is always row 0's
    T multiplier = (average and nActive > 0) ? (T)1 / nActive : (T)1;
+    auto d_out_f = d_output_features + row * nPlanes;
    for (Int i = 1; i <= nActive; ++i) {
-      auto d_in_f = d_input_features + nPlanes * rules[i];
+      auto d_in_f = d_input_features + r[i] * nPlanes;
      for (Int plane = 0; plane < nPlanes; plane++)
-        d_in_f[plane] += multiplier * d_output_features[plane];
+        d_in_f[plane] += multiplier * d_out_f[plane];
    }
-    d_output_features += nPlanes;
-    rules += 1 + maxActive;
  }
 }


--- a/sparseconvnet/SCN/CUDA/Convolution.cpp
+++ b/sparseconvnet/SCN/CUDA/Convolution.cpp
@@ -5,10 +5,9 @@
 // LICENSE file in the root directory of this source tree.

 template <typename T>
-void Convolution_fp_bias(T *of, T *b, Int nPlanes, Int nActiveOut);
+void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive);
 template <typename T>
-void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
-                         Int nCOLUMNS);
+void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive);
 template <typename T>
 double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
                             RuleBook _rules, Int input_nPlanes,
@@ -84,7 +83,7 @@ void cuda_Convolution_backward(

    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
-      Convolution_bp_bias(doF, db, op, op, nActiveOut);
+      Convolution_bp_bias(doF, db, op, nActiveOut);
    }
  }
 }
@@ -147,7 +146,7 @@ void cuda_SubmanifoldConvolution_backward(

    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
-      Convolution_bp_bias(doF, db, op, op, nActive);
+      Convolution_bp_bias(doF, db, op, nActive);
    }
  }
 }
@@ -216,7 +215,7 @@ void cuda_FullConvolution_backward(

    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
-      Convolution_bp_bias(doF, db, op, op, nActiveOut);
+      Convolution_bp_bias(doF, db, op, nActiveOut);
    }
  }
 }
@@ -283,7 +282,7 @@ void cuda_RandomizedStrideConvolution_backward(

    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
-      Convolution_bp_bias(doF, db, op, op, nActiveOut);
+      Convolution_bp_bias(doF, db, op, nActiveOut);
    }
  }
 }
--- a/sparseconvnet/SCN/CUDA/Convolution.cu
+++ b/sparseconvnet/SCN/CUDA/Convolution.cu
@@ -5,10 +5,11 @@
 // LICENSE file in the root directory of this source tree.

 #include "RuleBookIterator.h"
+#define TACC double

 template <typename T>
 __global__ void Convolution_fp_bias_(T *output_features, T *bias, Int nPlanes,
-                                     Int nActive) {
+				     Int nActive) {
  Int n = blockIdx.x * 32 + threadIdx.x;
  T b = bias[n];
  output_features += n;
@@ -21,41 +22,38 @@ template <typename T>
 void Convolution_fp_bias(T *oF, T *b, Int nPlanes, Int nActive) {
  if (nPlanes / 32 > 0)
    Convolution_fp_bias_<<<dim3(nPlanes / 32, 4096), 32>>>(oF, b, nPlanes,
-                                                           nActive);
+							   nActive);
  if (nPlanes % 32 > 0) {
    Int o = nPlanes / 32 * 32;
    Convolution_fp_bias_<<<dim3(1, 4096), nPlanes - o>>>(oF + o, b + o, nPlanes,
-                                                         nActive);
+							 nActive);
  }
 }

 template <typename T>
-__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
-                           Int nCOLUMNS) {
-  Int i = blockIdx.x * 32 + threadIdx.x;
-  T t = 0;
-  for (Int j = blockIdx.y; j < nRows; j += 32)
-    t += matrix[j * nCOLUMNS + i];
-  atomicAdd(&target[i], t);
+__global__ void Convolution_bp_bias_(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
+  Int n = blockIdx.x * 32 + threadIdx.x;
+  d_oF+=n;
+  TACC t = 0;
+  for (Int row = blockIdx.y; row < nActive; row += gridDim.y)
+    t += d_oF[row * nPlanes ];
+  atomicAdd(&d_b[n], t);
 }
 template <typename T>
-void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
-                         Int nCOLUMNS) {
-  if (nColumns / 32 > 0)
-    dColumnSum<<<dim3(nColumns / 32, 32), 32>>>(matrix, target, nRows, nColumns,
-                                                nCOLUMNS);
-  if (nColumns % 32 > 0) {
-    Int o = nColumns / 32 * 32;
-    dColumnSum<<<dim3(1, 32), nColumns - o>>>(matrix + o, target + o, nRows,
-                                              nColumns, nCOLUMNS);
+void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
+  if (nPlanes / 32 > 0)
+    Convolution_bp_bias_<<<dim3(nPlanes / 32, 32), 32>>>(d_oF, d_b, nPlanes, nActive);
+  if (nPlanes % 32 > 0) {
+    Int o = nPlanes / 32 * 32;
+    Convolution_bp_bias_<<<dim3(1, 32), nPlanes - o>>>(d_oF + o, d_b + o, nPlanes, nActive);
  }
 }

 template <typename T, Int K, Int V>
 __global__ void
 dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                            Int nHot, Int input_nPlanes, Int input_stride,
-                            Int output_nPlanes, Int output_stride) {
+			    Int nHot, Int input_nPlanes, Int input_stride,
+			    Int output_nPlanes, Int output_stride) {
  // nHot must be a multiple of K!!

  // Input x Weight -> Output
@@ -70,7 +68,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
  outFeatures += n * K;
  w += n * K;

-  T O[V];
+  TACC O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
@@ -90,31 +88,31 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        R0[v] = rules[2 * (s + ty[v])];
-        R1[v] = rules[2 * (s + ty[v]) + 1];
+	R0[v] = rules[2 * (s + ty[v])];
+	R1[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();

 // Read input, reset O[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        O[v] = 0;
+	I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	O[v] = 0;
      }
      __syncthreads();

 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
+	for (int v = 0; v < V; v++)
+	  O[v] += I[ty[v]][k] * W[k][tx];

 #pragma unroll
      for (int v = 0; v < V; v++)
-        O[v] += outFeatures[R1[v] * output_stride + tx];
+	O[v] += outFeatures[R1[v] * output_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        outFeatures[R1[v] * output_stride + tx] = O[v];
+	outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
@@ -124,8 +122,8 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
 template <typename T, Int K, Int V>
 __global__ void
 dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                            Int nHot, Int input_nPlanes, Int input_stride,
-                            Int output_nPlanes, Int output_stride) {
+			    Int nHot, Int input_nPlanes, Int input_stride,
+			    Int output_nPlanes, Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
@@ -138,7 +136,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
  outFeatures += n * K;
  w += n * K;

-  T O[V];
+  TACC O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
@@ -158,36 +156,36 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (s + ty[v] < nHot) {
-          R0[v] = rules[2 * (s + ty[v])];
-          R1[v] = rules[2 * (s + ty[v]) + 1];
-        }
+	if (s + ty[v] < nHot) {
+	  R0[v] = rules[2 * (s + ty[v])];
+	  R1[v] = rules[2 * (s + ty[v]) + 1];
+	}
      }
      __syncthreads();

 // Read input, reset O[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        O[v] = 0;
+	if (s + ty[v] < nHot)
+	  I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	O[v] = 0;
      }
      __syncthreads();

 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
+	for (int v = 0; v < V; v++)
+	  O[v] += I[ty[v]][k] * W[k][tx];

 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          O[v] += outFeatures[R1[v] * output_stride + tx];
+	if (s + ty[v] < nHot)
+	  O[v] += outFeatures[R1[v] * output_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          outFeatures[R1[v] * output_stride + tx] = O[v];
+	if (s + ty[v] < nHot)
+	  outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
@@ -200,24 +198,24 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
-        dConvolution_KMxKN_forwardA<                                           \
-            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
-                       dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
-                                         input_nPlanes, input_stride,          \
-                                         output_nPlanes, output_stride);       \
+	dConvolution_KMxKN_forwardA<                                           \
+	    T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
+		       dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
+					 input_nPlanes, input_stride,          \
+					 output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
-        dConvolution_KMxKN_forwardB<                                           \
-            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
-            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
-            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
+	dConvolution_KMxKN_forwardB<                                           \
+	    T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
+	    inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
+	    input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      return;                                                                  \
    }                                                                          \
  }

 template <typename T>
 void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                          Int nHot, Int input_nPlanes, Int input_stride,
-                          Int output_nPlanes, Int output_stride) {
+			  Int nHot, Int input_nPlanes, Int input_stride,
+			  Int output_nPlanes, Int output_stride) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
@@ -226,9 +224,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
 }
 template <>
 void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
-                                  double *w, Int *rules, Int nHot,
-                                  Int input_nPlanes, Int input_stride,
-                                  Int output_nPlanes, Int output_stride) {
+				  double *w, Int *rules, Int nHot,
+				  Int input_nPlanes, Int input_stride,
+				  Int output_nPlanes, Int output_stride) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
@@ -242,9 +240,9 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
 template <typename T, Int K, Int V>
 __global__ void
 dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                                 T *w, T *dw, Int *rules, Int nHot,
-                                 Int input_nPlanes, Int input_stride,
-                                 Int output_nPlanes, Int output_stride) {
+				 T *w, T *dw, Int *rules, Int nHot,
+				 Int input_nPlanes, Int input_stride,
+				 Int output_nPlanes, Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
@@ -253,8 +251,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;

-  T dI[V];
-  T dW[V];
+  TACC dI[V];
+  TACC dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
@@ -277,31 +275,31 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        R0[v] = rules[2 * (s + ty[v])];
-        R1[v] = rules[2 * (s + ty[v]) + 1];
-        dI[v] = 0;
+	R0[v] = rules[2 * (s + ty[v])];
+	R1[v] = rules[2 * (s + ty[v]) + 1];
+	dI[v] = 0;
      }
      __syncthreads();
 // Read input and dOutput
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+	I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++) {
-          dI[v] += dO[ty[v]][k] * W[tx][k];
-          dW[v] += I[k][ty[v]] * dO[k][tx];
-        }
+	for (int v = 0; v < V; v++) {
+	  dI[v] += dO[ty[v]][k] * W[tx][k];
+	  dW[v] += I[k][ty[v]] * dO[k][tx];
+	}
 #pragma unroll
      for (int v = 0; v < V; v++)
-        dI[v] += dInFeatures[R0[v] * input_stride + tx];
+	dI[v] += dInFeatures[R0[v] * input_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        dInFeatures[R0[v] * input_stride + tx] = dI[v];
+	dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
 #pragma unroll
@@ -319,9 +317,9 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T, Int K, Int V>
 __global__ void
 dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                                 T *w, T *dw, Int *rules, Int nHot,
-                                 Int input_nPlanes, Int input_stride,
-                                 Int output_nPlanes, Int output_stride) {
+				 T *w, T *dw, Int *rules, Int nHot,
+				 Int input_nPlanes, Int input_stride,
+				 Int output_nPlanes, Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = output_nPlanes / K;
  Int m = blockIdx.y;
@@ -330,8 +328,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;

-  T dI[V];
-  T dW[V];
+  TACC dI[V];
+  TACC dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
@@ -354,39 +352,39 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (s + ty[v] < nHot) {
-          R0[v] = rules[2 * (s + ty[v])];
-          R1[v] = rules[2 * (s + ty[v]) + 1];
-        }
-        dI[v] = 0;
+	if (s + ty[v] < nHot) {
+	  R0[v] = rules[2 * (s + ty[v])];
+	  R1[v] = rules[2 * (s + ty[v]) + 1];
+	}
+	dI[v] = 0;
      }
      __syncthreads();
 // Read input and dOutput
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot) {
-          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
-        } else {
-          I[ty[v]][tx] = 0;
-          dO[ty[v]][tx] = 0;
-        }
+	if (s + ty[v] < nHot) {
+	  I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	  dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+	} else {
+	  I[ty[v]][tx] = 0;
+	  dO[ty[v]][tx] = 0;
+	}
      __syncthreads();
 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++) {
-          dI[v] += dO[ty[v]][k] * W[tx][k];
-          dW[v] += I[k][ty[v]] * dO[k][tx];
-        }
+	for (int v = 0; v < V; v++) {
+	  dI[v] += dO[ty[v]][k] * W[tx][k];
+	  dW[v] += I[k][ty[v]] * dO[k][tx];
+	}
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          dI[v] += dInFeatures[R0[v] * input_stride + tx];
+	if (s + ty[v] < nHot)
+	  dI[v] += dInFeatures[R0[v] * input_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          dInFeatures[R0[v] * input_stride + tx] = dI[v];
+	if (s + ty[v] < nHot)
+	  dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
 #pragma unroll
@@ -403,26 +401,26 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
-        dConvolution_KMxKN_backward_dW_A<                                      \
-            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
-                       dim3(K, K / V)>>>(                                      \
-            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
-            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
+	dConvolution_KMxKN_backward_dW_A<                                      \
+	    T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
+		       dim3(K, K / V)>>>(                                      \
+	    inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
+	    input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
-        dConvolution_KMxKN_backward_dW_B<                                      \
-            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
-            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
-            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
-            output_stride);                                                    \
+	dConvolution_KMxKN_backward_dW_B<                                      \
+	    T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
+	    inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
+	    nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
+	    output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }

 template <typename T>
 void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                              T *w, T *dw, Int *rules, Int nHot,
-                              Int input_nPlanes, Int input_stride,
-                              Int output_nPlanes, Int output_stride) {
+			      T *w, T *dw, Int *rules, Int nHot,
+			      Int input_nPlanes, Int input_stride,
+			      Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
@@ -433,8 +431,8 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T, Int K, Int V>
 __global__ void
 dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                            Int nHot, Int input_nPlanes, Int input_stride,
-                            Int output_nPlanes, Int output_stride) {
+			    Int nHot, Int input_nPlanes, Int input_stride,
+			    Int output_nPlanes, Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
@@ -449,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
  w += n * K;
  Int KO = min(K, output_nPlanes - K * n);

-  T O[V];
+  TACC O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ Int R[K * 2];
@@ -466,40 +464,40 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
 #pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
-        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+	W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];

    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 // Read rules for K input/output pairs
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (ty[v] < 2) {
-          int q = ty[v] * K + tx;
-          if (s + q / 2 < nHot)
-            R[q] = rules[2 * s + q];
-        }
+	if (ty[v] < 2) {
+	  int q = ty[v] * K + tx;
+	  if (s + q / 2 < nHot)
+	    R[q] = rules[2 * s + q];
+	}
      }
      __syncthreads();

 // Read input, reset O[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (tx < KI and s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
-        O[v] = 0;
+	if (tx < KI and s + ty[v] < nHot)
+	  I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
+	O[v] = 0;
      }
      __syncthreads();

 #pragma unroll
      for (int k = 0; k < KI; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
+	for (int v = 0; v < V; v++)
+	  O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();

 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (tx < KO and s + ty[v] < nHot)
-          outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
+	if (tx < KO and s + ty[v] < nHot)
+	  outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
@@ -513,9 +511,9 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
 template <typename T, Int K, Int V>
 __global__ void
 dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                                T *w, T *dw, Int *rules, Int nHot,
-                                Int input_nPlanes, Int input_stride,
-                                Int output_nPlanes, Int output_stride) {
+				T *w, T *dw, Int *rules, Int nHot,
+				Int input_nPlanes, Int input_stride,
+				Int output_nPlanes, Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = (output_nPlanes + K - 1) / K;
  Int m = blockIdx.y;
@@ -525,8 +523,8 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m);

-  T dI[V];
-  T dW[V];
+  TACC dI[V];
+  TACC dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
@@ -544,7 +542,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 #pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
-        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+	W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }

@@ -552,48 +550,48 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 // Read rules for K input/output pairs, reset dI[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (ty[v] < 2) {
-          int q = ty[v] * K + tx;
-          if (s + q / 2 < nHot)
-            R[q] = rules[2 * s + q];
-        }
-        dI[v] = 0;
+	if (ty[v] < 2) {
+	  int q = ty[v] * K + tx;
+	  if (s + q / 2 < nHot)
+	    R[q] = rules[2 * s + q];
+	}
+	dI[v] = 0;
      }
      __syncthreads();
 // Read input and dOutput
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (tx < KI and s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
-        else
-          I[ty[v]][tx] = 0;
-        if (tx < KO and s + ty[v] < nHot)
-          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
-        else
-          dO[ty[v]][tx] = 0;
+	if (tx < KI and s + ty[v] < nHot)
+	  I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
+	else
+	  I[ty[v]][tx] = 0;
+	if (tx < KO and s + ty[v] < nHot)
+	  dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
+	else
+	  dO[ty[v]][tx] = 0;
      }
      __syncthreads();
 #pragma unroll
      for (int k = 0; k < KO; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          dI[v] += dO[ty[v]][k] * W[tx][k];
+	for (int v = 0; v < V; v++)
+	  dI[v] += dO[ty[v]][k] * W[tx][k];
 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          dW[v] += I[k][ty[v]] * dO[k][tx];
+	for (int v = 0; v < V; v++)
+	  dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (tx < KI and s + ty[v] < nHot)
-          dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
+	if (tx < KI and s + ty[v] < nHot)
+	  dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
      __syncthreads();
    }
 #pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
-        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
+	atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
@@ -602,51 +600,52 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,

 template <typename T>
 double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
-                             RuleBook _rules, Int input_nPlanes,
-                             Int input_stride, Int output_nPlanes,
-                             Int output_stride) {
+			     RuleBook _rules, Int input_nPlanes,
+			     Int input_stride, Int output_nPlanes,
+			     Int output_stride) {
  Int c = input_nPlanes * output_nPlanes;
  double flops = 0;
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    RULEBOOKITERATOR(
-        (dConvolution_KMxKN_forward2<
-            T, K,
-            V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
-            inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
-            output_nPlanes, output_stride));
-        , w += c; flops += nHotB * c;)
+	(dConvolution_KMxKN_forward2<
+	    T, K,
+	    V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
+	    inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
+	    output_nPlanes, output_stride));
+	, w += c; flops += nHotB * c;)
  } else {
    RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB,
-                                          nHotB, input_nPlanes, input_stride,
-                                          output_nPlanes, output_stride);
-                     , w += c; flops += nHotB * c;)
+					  nHotB, input_nPlanes, input_stride,
+					  output_nPlanes, output_stride);
+		     , w += c; flops += nHotB * c;)
  }
  return flops;
 }

 template <typename T>
 void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                               T *w, T *dw, RuleBook _rules, Int input_nPlanes,
-                               Int input_stride, Int output_nPlanes,
-                               Int output_stride) {
+			       T *w, T *dw, RuleBook _rules, Int input_nPlanes,
+			       Int input_stride, Int output_nPlanes,
+			       Int output_stride) {
  Int c = input_nPlanes * output_nPlanes;
  if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
    const int K = 16;
    const int V = 4;
    RULEBOOKITERATOR(
-        (dConvolution_KMxKN_backward_dW2<
-            T, K,
-            V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
-            inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
-            input_nPlanes, input_stride, output_nPlanes, output_stride));
-        , w += c; dw += c;)
+	(dConvolution_KMxKN_backward_dW2<
+	    T, K,
+	    V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
+	    inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
+	    input_nPlanes, input_stride, output_nPlanes, output_stride));
+	, w += c; dw += c;)
  } else {
    RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures,
-                                              dOutFeatures, w, dw, rbB, nHotB,
-                                              input_nPlanes, input_stride,
-                                              output_nPlanes, output_stride);
-                     , w += c; dw += c;)
+					      dOutFeatures, w, dw, rbB, nHotB,
+					      input_nPlanes, input_stride,
+					      output_nPlanes, output_stride);
+		     , w += c; dw += c;)
  }
 }
+#undef TACC
\ No newline at end of file
--- a/sparseconvnet/SCN/CUDA/Deconvolution.cpp
+++ b/sparseconvnet/SCN/CUDA/Deconvolution.cpp
@@ -78,7 +78,7 @@ void cuda_Deconvolution_backward(
    dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
-      Convolution_bp_bias(doF, db, op, op, nActiveOut);
+      Convolution_bp_bias(doF, db, op, nActiveOut);
    }
  }
 }
--- a/sparseconvnet/SCN/CUDA/Deconvolution.cu
+++ b/sparseconvnet/SCN/CUDA/Deconvolution.cu
@@ -4,11 +4,13 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.

+#define TACC double
+
 template <typename T, Int K, Int V>
 __global__ void
 dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                              Int nHot, Int input_nPlanes, Int input_stride,
-                              Int output_nPlanes, Int output_stride) {
+			      Int nHot, Int input_nPlanes, Int input_stride,
+			      Int output_nPlanes, Int output_stride) {
  // nHot must be a multiple of K!!

  // Input x Weight -> Output
@@ -23,7 +25,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
  outFeatures += n * K;
  w += n * K;

-  T O[V];
+  TACC O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
@@ -43,31 +45,31 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        R1[v] = rules[2 * (s + ty[v])];
-        R0[v] = rules[2 * (s + ty[v]) + 1];
+	R1[v] = rules[2 * (s + ty[v])];
+	R0[v] = rules[2 * (s + ty[v]) + 1];
      }
      __syncthreads();

 // Read input, reset O[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        O[v] = 0;
+	I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	O[v] = 0;
      }
      __syncthreads();

 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
+	for (int v = 0; v < V; v++)
+	  O[v] += I[ty[v]][k] * W[k][tx];

 #pragma unroll
      for (int v = 0; v < V; v++)
-        O[v] += outFeatures[R1[v] * output_stride + tx];
+	O[v] += outFeatures[R1[v] * output_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        outFeatures[R1[v] * output_stride + tx] = O[v];
+	outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
@@ -77,8 +79,8 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
 template <typename T, Int K, Int V>
 __global__ void
 dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                              Int nHot, Int input_nPlanes, Int input_stride,
-                              Int output_nPlanes, Int output_stride) {
+			      Int nHot, Int input_nPlanes, Int input_stride,
+			      Int output_nPlanes, Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
@@ -91,7 +93,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
  outFeatures += n * K;
  w += n * K;

-  T O[V];
+  TACC O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  Int R0[V];
@@ -111,36 +113,36 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (s + ty[v] < nHot) {
-          R1[v] = rules[2 * (s + ty[v])];
-          R0[v] = rules[2 * (s + ty[v]) + 1];
-        }
+	if (s + ty[v] < nHot) {
+	  R1[v] = rules[2 * (s + ty[v])];
+	  R0[v] = rules[2 * (s + ty[v]) + 1];
+	}
      }
      __syncthreads();

 // Read input, reset O[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        O[v] = 0;
+	if (s + ty[v] < nHot)
+	  I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	O[v] = 0;
      }
      __syncthreads();

 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
+	for (int v = 0; v < V; v++)
+	  O[v] += I[ty[v]][k] * W[k][tx];

 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          O[v] += outFeatures[R1[v] * output_stride + tx];
+	if (s + ty[v] < nHot)
+	  O[v] += outFeatures[R1[v] * output_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          outFeatures[R1[v] * output_stride + tx] = O[v];
+	if (s + ty[v] < nHot)
+	  outFeatures[R1[v] * output_stride + tx] = O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
@@ -153,24 +155,24 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
-        dDeconvolution_KMxKN_forwardA<                                         \
-            T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
-                       dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
-                                         input_nPlanes, input_stride,          \
-                                         output_nPlanes, output_stride);       \
+	dDeconvolution_KMxKN_forwardA<                                         \
+	    T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K),    \
+		       dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
+					 input_nPlanes, input_stride,          \
+					 output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
-        dDeconvolution_KMxKN_forwardB<                                         \
-            T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
-            inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
-            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
+	dDeconvolution_KMxKN_forwardB<                                         \
+	    T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>(         \
+	    inFeatures, outFeatures, w, rules + 2 * o, nHot - o,               \
+	    input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      return;                                                                  \
    }                                                                          \
  }

 template <typename T>
 void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                            Int nHot, Int input_nPlanes, Int input_stride,
-                            Int output_nPlanes, Int output_stride) {
+			    Int nHot, Int input_nPlanes, Int input_stride,
+			    Int output_nPlanes, Int output_stride) {
  FOO(T, 64, 16)
  FOO(T, 32, 8)
  FOO(T, 16, 4)
@@ -179,9 +181,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
 }
 template <>
 void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
-                                    double *w, Int *rules, Int nHot,
-                                    Int input_nPlanes, Int input_stride,
-                                    Int output_nPlanes, Int output_stride) {
+				    double *w, Int *rules, Int nHot,
+				    Int input_nPlanes, Int input_stride,
+				    Int output_nPlanes, Int output_stride) {
  FOO(double, 32, 8)
  FOO(double, 16, 4)
  FOO(double, 8, 2)
@@ -205,8 +207,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;

-  T dI[V];
-  T dW[V];
+  TACC dI[V];
+  TACC dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
@@ -229,31 +231,31 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        R1[v] = rules[2 * (s + ty[v])];
-        R0[v] = rules[2 * (s + ty[v]) + 1];
-        dI[v] = 0;
+	R1[v] = rules[2 * (s + ty[v])];
+	R0[v] = rules[2 * (s + ty[v]) + 1];
+	dI[v] = 0;
      }
      __syncthreads();
 // Read input and dOutput
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-        dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+	I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
      }
      __syncthreads();
 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++) {
-          dI[v] += dO[ty[v]][k] * W[tx][k];
-          dW[v] += I[k][ty[v]] * dO[k][tx];
-        }
+	for (int v = 0; v < V; v++) {
+	  dI[v] += dO[ty[v]][k] * W[tx][k];
+	  dW[v] += I[k][ty[v]] * dO[k][tx];
+	}
 #pragma unroll
      for (int v = 0; v < V; v++)
-        dI[v] += dInFeatures[R0[v] * input_stride + tx];
+	dI[v] += dInFeatures[R0[v] * input_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        dInFeatures[R0[v] * input_stride + tx] = dI[v];
+	dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
 #pragma unroll
@@ -281,8 +283,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
  w += m * K * output_nPlanes;
  dw += m * K * output_nPlanes;

-  T dI[V];
-  T dW[V];
+  TACC dI[V];
+  TACC dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
@@ -305,39 +307,39 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (s + ty[v] < nHot) {
-          R1[v] = rules[2 * (s + ty[v])];
-          R0[v] = rules[2 * (s + ty[v]) + 1];
-        }
-        dI[v] = 0;
+	if (s + ty[v] < nHot) {
+	  R1[v] = rules[2 * (s + ty[v])];
+	  R0[v] = rules[2 * (s + ty[v]) + 1];
+	}
+	dI[v] = 0;
      }
      __syncthreads();
 // Read input and dOutput
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot) {
-          I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
-          dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
-        } else {
-          I[ty[v]][tx] = 0;
-          dO[ty[v]][tx] = 0;
-        }
+	if (s + ty[v] < nHot) {
+	  I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
+	  dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
+	} else {
+	  I[ty[v]][tx] = 0;
+	  dO[ty[v]][tx] = 0;
+	}
      __syncthreads();
 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++) {
-          dI[v] += dO[ty[v]][k] * W[tx][k];
-          dW[v] += I[k][ty[v]] * dO[k][tx];
-        }
+	for (int v = 0; v < V; v++) {
+	  dI[v] += dO[ty[v]][k] * W[tx][k];
+	  dW[v] += I[k][ty[v]] * dO[k][tx];
+	}
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          dI[v] += dInFeatures[R0[v] * input_stride + tx];
+	if (s + ty[v] < nHot)
+	  dI[v] += dInFeatures[R0[v] * input_stride + tx];
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (s + ty[v] < nHot)
-          dInFeatures[R0[v] * input_stride + tx] = dI[v];
+	if (s + ty[v] < nHot)
+	  dInFeatures[R0[v] * input_stride + tx] = dI[v];
      __syncthreads();
    }
 #pragma unroll
@@ -354,26 +356,26 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
    if (input_nPlanes % K == 0 and output_nPlanes % K == 0) {                  \
      Int o = (nHot / K) * K;                                                  \
      if (o >= K)                                                              \
-        dDeconvolution_KMxKN_backward_dW_A<                                    \
-            T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
-                       dim3(K, K / V)>>>(                                      \
-            inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
-            input_nPlanes, input_stride, output_nPlanes, output_stride);       \
+	dDeconvolution_KMxKN_backward_dW_A<                                    \
+	    T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K),     \
+		       dim3(K, K / V)>>>(                                      \
+	    inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o,            \
+	    input_nPlanes, input_stride, output_nPlanes, output_stride);       \
      if (nHot > o)                                                            \
-        dDeconvolution_KMxKN_backward_dW_B<                                    \
-            T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
-            inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
-            nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
-            output_stride);                                                    \
+	dDeconvolution_KMxKN_backward_dW_B<                                    \
+	    T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>(          \
+	    inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o,       \
+	    nHot - o, input_nPlanes, input_stride, output_nPlanes,             \
+	    output_stride);                                                    \
      return;                                                                  \
    }                                                                          \
  }

 template <typename T>
 void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                                T *w, T *dw, Int *rules, Int nHot,
-                                Int input_nPlanes, Int input_stride,
-                                Int output_nPlanes, Int output_stride) {
+				T *w, T *dw, Int *rules, Int nHot,
+				Int input_nPlanes, Int input_stride,
+				Int output_nPlanes, Int output_stride) {
  FOO(T, 32, 8)
  FOO(T, 16, 4)
  FOO(T, 8, 2)
@@ -384,8 +386,8 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
 template <typename T, Int K, Int V>
 __global__ void
 dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
-                              Int nHot, Int input_nPlanes, Int input_stride,
-                              Int output_nPlanes, Int output_stride) {
+			      Int nHot, Int input_nPlanes, Int input_stride,
+			      Int output_nPlanes, Int output_stride) {
  // Input x Weight -> Output
  // blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
  // K is a multiple of V,
@@ -400,7 +402,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
  w += n * K;
  Int KO = min(K, output_nPlanes - K * n);

-  T O[V];
+  TACC O[V];
  __shared__ T W[K][K];
  __shared__ T I[K][K];
  __shared__ Int R[K * 2];
@@ -417,40 +419,40 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
 #pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
-        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+	W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];

    for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
 // Read rules for K input/output pairs
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (ty[v] < 2) {
-          int q = ty[v] * K + tx;
-          if (s + q / 2 < nHot)
-            R[q] = rules[2 * s + q];
-        }
+	if (ty[v] < 2) {
+	  int q = ty[v] * K + tx;
+	  if (s + q / 2 < nHot)
+	    R[q] = rules[2 * s + q];
+	}
      }
      __syncthreads();

 // Read input, reset O[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (tx < KI and s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
-        O[v] = 0;
+	if (tx < KI and s + ty[v] < nHot)
+	  I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
+	O[v] = 0;
      }
      __syncthreads();

 #pragma unroll
      for (int k = 0; k < KI; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          O[v] += I[ty[v]][k] * W[k][tx];
+	for (int v = 0; v < V; v++)
+	  O[v] += I[ty[v]][k] * W[k][tx];
      __syncthreads();

 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (tx < KO and s + ty[v] < nHot)
-          outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
+	if (tx < KO and s + ty[v] < nHot)
+	  outFeatures[R[2 * ty[v]] * output_stride + tx] += O[v];
      __syncthreads();
    }
    w += K * output_nPlanes;
@@ -464,9 +466,9 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
 template <typename T, Int K, Int V>
 __global__ void
 dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
-                                  T *dOutFeatures, T *w, T *dw, Int *rules,
-                                  Int nHot, Int input_nPlanes, Int input_stride,
-                                  Int output_nPlanes, Int output_stride) {
+				  T *dOutFeatures, T *w, T *dw, Int *rules,
+				  Int nHot, Int input_nPlanes, Int input_stride,
+				  Int output_nPlanes, Int output_stride) {
  // M = gridDim.y == input_nPlanes / K
  Int N = (output_nPlanes + K - 1) / K;
  Int m = blockIdx.y;
@@ -476,8 +478,8 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
  dw += m * K * output_nPlanes;
  Int KI = min(K, input_nPlanes - K * m);

-  T dI[V];
-  T dW[V];
+  TACC dI[V];
+  TACC dW[V];
  __shared__ T I[K][K];
  __shared__ T dO[K][K];
  __shared__ T W[K][K];
@@ -495,7 +497,7 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
 #pragma unroll
    for (int v = 0; v < V; v++) {
      if (ty[v] < KI and tx < KO)
-        W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
+	W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
      dW[v] = 0;
    }

@@ -503,48 +505,48 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
 // Read rules for K input/output pairs, reset dI[]
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (ty[v] < 2) {
-          int q = ty[v] * K + tx;
-          if (s + q / 2 < nHot)
-            R[q] = rules[2 * s + q];
-        }
-        dI[v] = 0;
+	if (ty[v] < 2) {
+	  int q = ty[v] * K + tx;
+	  if (s + q / 2 < nHot)
+	    R[q] = rules[2 * s + q];
+	}
+	dI[v] = 0;
      }
      __syncthreads();
 // Read input and dOutput
 #pragma unroll
      for (int v = 0; v < V; v++) {
-        if (tx < KI and s + ty[v] < nHot)
-          I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
-        else
-          I[ty[v]][tx] = 0;
-        if (tx < KO and s + ty[v] < nHot)
-          dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
-        else
-          dO[ty[v]][tx] = 0;
+	if (tx < KI and s + ty[v] < nHot)
+	  I[ty[v]][tx] = inFeatures[R[2 * ty[v] + 1] * input_stride + tx];
+	else
+	  I[ty[v]][tx] = 0;
+	if (tx < KO and s + ty[v] < nHot)
+	  dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v]] * output_stride + tx];
+	else
+	  dO[ty[v]][tx] = 0;
      }
      __syncthreads();
 #pragma unroll
      for (int k = 0; k < KO; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          dI[v] += dO[ty[v]][k] * W[tx][k];
+	for (int v = 0; v < V; v++)
+	  dI[v] += dO[ty[v]][k] * W[tx][k];
 #pragma unroll
      for (int k = 0; k < K; k++)
 #pragma unroll
-        for (int v = 0; v < V; v++)
-          dW[v] += I[k][ty[v]] * dO[k][tx];
+	for (int v = 0; v < V; v++)
+	  dW[v] += I[k][ty[v]] * dO[k][tx];
      __syncthreads();
 #pragma unroll
      for (int v = 0; v < V; v++)
-        if (tx < KI and s + ty[v] < nHot)
-          dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
+	if (tx < KI and s + ty[v] < nHot)
+	  dInFeatures[R[2 * ty[v] + 1] * input_stride + tx] += dI[v];
      __syncthreads();
    }
 #pragma unroll
    for (int v = 0; v < V; v++)
      if (ty[v] < KI and tx < KO)
-        atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
+	atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
    w += K;
    dw += K;
    dOutFeatures += K;
@@ -553,51 +555,53 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,

 template <typename T>
 double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
-                               RuleBook _rules, Int input_nPlanes,
-                               Int input_stride, Int output_nPlanes,
-                               Int output_stride) {
+			       RuleBook _rules, Int input_nPlanes,
+			       Int input_stride, Int output_nPlanes,
+			       Int output_stride) { // Host-side launcher for the deconvolution forward pass; returns the running sum of nHotB * c (c = input_nPlanes * output_nPlanes) as a double.
   Int c = input_nPlanes * output_nPlanes;
   double flops = 0;
   if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
     const int K = 16;
     const int V = 4;
     RULEBOOKITERATOR(
-        (dDeconvolution_KMxKN_forward2<
-            T, K,
-            V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
-            inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
-            output_nPlanes, output_stride));
-        , w += c; flops += nHotB * c;)
+	(dDeconvolution_KMxKN_forward2< // taken when either plane count is not a multiple of 8; launched with K = 16, V = 4
+	    T, K,
+	    V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>( // grid.y = ceil(output_nPlanes / K), block = (K, K / V)
+	    inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
+	    output_nPlanes, output_stride));
+	, w += c; flops += (double)nHotB * c;) // widen before multiplying: nHotB * c evaluated in Int arithmetic can overflow before it reaches the double accumulator
   } else {
     RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
-                                            nHotB, input_nPlanes, input_stride,
-                                            output_nPlanes, output_stride);
-                     , w += c; flops += nHotB * c;)
+					    nHotB, input_nPlanes, input_stride, // both plane counts are multiples of 8: delegate to dDeconvolution_forward
+					    output_nPlanes, output_stride);
+		     , w += c; flops += (double)nHotB * c;) // same widening as above; w advances by c after each step (presumably one step per rulebook entry - RULEBOOKITERATOR is defined outside this hunk)
   }
   return flops;
  }

 template <typename T>
 void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
-                                 T *w, T *dw, RuleBook _rules,
-                                 Int input_nPlanes, Int input_stride,
-                                 Int output_nPlanes, Int output_stride) {
+				 T *w, T *dw, RuleBook _rules,
+				 Int input_nPlanes, Int input_stride,
+				 Int output_nPlanes, Int output_stride) { // Host-side launcher for the deconvolution backward pass: hands inFeatures/dInFeatures/dOutFeatures and w/dw to a kernel or helper through RULEBOOKITERATOR.
   Int c = input_nPlanes * output_nPlanes;
   if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
     const int K = 16;
     const int V = 4;
     RULEBOOKITERATOR(
-        (dDeconvolution_KMxKN_backward_dW2<
-            T, K,
-            V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
-            inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
-            input_nPlanes, input_stride, output_nPlanes, output_stride));
-        , w += c; dw += c;)
+	(dDeconvolution_KMxKN_backward_dW2< // taken when either plane count is not a multiple of 8; launched with K = 16, V = 4
+	    T, K,
+	    V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>( // grid.y = ceil(input_nPlanes / K), block = (K, K / V)
+	    inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
+	    input_nPlanes, input_stride, output_nPlanes, output_stride));
+	, w += c; dw += c;) // w and dw both advance by c = input_nPlanes * output_nPlanes after each step (presumably one step per rulebook entry - RULEBOOKITERATOR is defined outside this hunk)
   } else {
     RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures,
-                                                dOutFeatures, w, dw, rbB, nHotB,
-                                                input_nPlanes, input_stride,
-                                                output_nPlanes, output_stride);
-                     , w += c; dw += c;)
+						dOutFeatures, w, dw, rbB, nHotB, // both plane counts are multiples of 8: delegate to dDeconvolution_backward_dW
+						input_nPlanes, input_stride,
+						output_nPlanes, output_stride);
+		     , w += c; dw += c;) // same pointer advance as in the branch above
   }
  }
+
+#undef TACC
\ No newline at end of file
--- a/sparseconvnet/SCN/cuda.cu
+++ b/sparseconvnet/SCN/cuda.cu
@@ -15,15 +15,15 @@
 #include "CUDA/UnPooling.cu"

 template void ActivePooling_ForwardPass<float>(float *input_features,
-                                               float *output_features,
-                                               Int batchSize, Int maxActive,
-                                               Int nPlanes, Int *rules,
-                                               bool average);
+					       float *output_features,
+					       Int batchSize, Int maxActive,
+					       Int nPlanes, Int *rules,
+					       bool average);
 template void ActivePooling_BackwardPass<float>(float *d_input_features,
-                                                float *d_output_features,
-                                                Int batchSize, Int maxActive,
-                                                Int nPlanes, Int *rules,
-                                                bool average);
+						float *d_output_features,
+						Int batchSize, Int maxActive,
+						Int nPlanes, Int *rules,
+						bool average);

 template void dAffineReluTrivialConvolution_forward<float>(
    float *inFeatures, float *outFeatures, float *affineWeight,
@@ -43,10 +43,10 @@ template void cuda_AveragePooling_BackwardPass<float>(
    float *d_input_features, float *d_output_features, Int nPlanes,
    Int input_stride, Int output_stride, RuleBook _rules, Int filterVolume);

-template void Convolution_fp_bias<float>(float *of, float *b, Int op,
-                                         Int nActive);
-template void Convolution_bp_bias<float>(float *matrix, float *target,
-                                         Int nRows, Int nColumns, Int nCOLUMNS);
+template void Convolution_fp_bias<float>(float *oF, float *b, Int nPlanes,
+					 Int nActive);
+template void Convolution_bp_bias<float>(float *d_oF, float *d_b,
+					 Int nPlanes, Int nActive);
 template double dConvolution_forward2<float>(
    float *inFeatures, float *outFeatures, float *w, RuleBook _rules,
    Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride);
@@ -66,65 +66,65 @@ template void dDeconvolution_backward_dW2<float>(
    Int output_nPlanes, Int output_stride);

 template void InputLayer_fp<float>(float *input_features,
-                                   float *output_features, Int nRows,
-                                   Int maxActive, Int nPlanes, Int *rules_cpu,
-                                   Int *rules_gpu, bool average);
+				   float *output_features, Int nRows,
+				   Int maxActive, Int nPlanes, Int *rules_cpu,
+				   Int *rules_gpu, bool average);
 template void InputLayer_bp<float>(float *d_input_features,
-                                   float *d_output_features, Int nRows,
-                                   Int maxActive, Int nPlanes, Int *rules_cpu,
-                                   Int *rules_gpu, bool average);
+				   float *d_output_features, Int nRows,
+				   Int maxActive, Int nPlanes, Int *rules_cpu,
+				   Int *rules_gpu, bool average);

 template void LeakyReLU_fp<float>(float *input_features, float *output_features,
-                                  Int n, float alpha);
+				  Int n, float alpha);
 template void LeakyReLU_bp<float>(float *input_features,
-                                  float *d_input_features,
-                                  float *output_features, Int n, float alpha);
+				  float *d_input_features,
+				  float *output_features, Int n, float alpha);
 template void cuda_MaxPooling_ForwardPass<float>(float *input_features,
-                                                 float *output_features,
-                                                 Int nPlanes, Int input_stride,
-                                                 Int output_stride,
-                                                 RuleBook _rules);
+						 float *output_features,
+						 Int nPlanes, Int input_stride,
+						 Int output_stride,
+						 RuleBook _rules);
 template void cuda_MaxPooling_BackwardPass<float>(
    float *input_features, float *d_input_features, float *output_features,
    float *d_output_features, Int nPlanes, Int input_stride, Int output_stride,
    RuleBook _rules);
 template void cuda_SparseToDense_ForwardPass<float>(float *input_features,
-                                                    float *output_features,
-                                                    Int nPlanes,
-                                                    Int spatialVolume,
-                                                    RuleBook _rules);
+						    float *output_features,
+						    Int nPlanes,
+						    Int spatialVolume,
+						    RuleBook _rules);
 template void cuda_SparseToDense_BackwardPass<float>(float *d_input_features,
-                                                     float *d_output_features,
-                                                     Int nPlanes,
-                                                     Int spatialVolume,
-                                                     RuleBook _rules);
+						     float *d_output_features,
+						     Int nPlanes,
+						     Int spatialVolume,
+						     RuleBook _rules);
 template void cuda_UnPooling_ForwardPass<float>(float *input_features,
-                                                float *output_features,
-                                                Int nPlanes, Int input_stride,
-                                                Int output_stride,
-                                                RuleBook _rules);
+						float *output_features,
+						Int nPlanes, Int input_stride,
+						Int output_stride,
+						RuleBook _rules);
 template void cuda_UnPooling_BackwardPass<float>(float *d_input_features,
-                                                 float *d_output_features,
-                                                 Int nPlanes, Int input_stride,
-                                                 Int output_stride,
-                                                 RuleBook _rules);
+						 float *d_output_features,
+						 Int nPlanes, Int input_stride,
+						 Int output_stride,
+						 RuleBook _rules);

 template void bn_f<float>(float *iF, float *oF, Int nPlanes, Int input_stride,
-                          Int output_stride, Int nActive, float *saveMean,
-                          float *saveInvStd, float *runningMean,
-                          float *runningVar, float *weight, float *bias,
-                          float eps, float momentum, bool train,
-                          float leakiness);
+			  Int output_stride, Int nActive, float *saveMean,
+			  float *saveInvStd, float *runningMean,
+			  float *runningVar, float *weight, float *bias,
+			  float eps, float momentum, bool train,
+			  float leakiness);
 template void bn_b<float>(float *input_features, float *d_input_features,
-                          float *output_features, float *d_output_features,
-                          Int nPlanes, Int input_stride, Int output_stride,
-                          Int nActive, float *saveMean, float *saveInvStd,
-                          float *runningMean, float *runningVar, float *weight,
-                          float *bias, float *d_weight, float *d_bias,
-                          float leakiness);
+			  float *output_features, float *d_output_features,
+			  Int nPlanes, Int input_stride, Int output_stride,
+			  Int nActive, float *saveMean, float *saveInvStd,
+			  float *runningMean, float *runningVar, float *weight,
+			  float *bias, float *d_weight, float *d_bias,
+			  float leakiness);

 template void bmd_f<float>(float *input_features, float *output_features,
-                           float *noise, Int nActive, Int nPlanes, float alpha);
+			   float *noise, Int nActive, Int nPlanes, float alpha);
 template void bmd_b<float>(float *input_features, float *d_input_features,
-                           float *d_output_features, float *noise, Int nActive,
-                           Int nPlanes, float alpha);
+			   float *d_output_features, float *noise, Int nActive,
+			   Int nPlanes, float alpha);
--- a/sparseconvnet/__init__.py
+++ b/sparseconvnet/__init__.py
@@ -6,7 +6,7 @@

 forward_pass_multiplyAdd_count = 0
 forward_pass_hidden_states = 0
-from .activations import Tanh, Sigmoid, ReLU, ELU, SELU, BatchNormELU
+from .activations import Tanh, Sigmoid, ReLU, LeakyReLU, ELU, SELU, BatchNormELU
 from .averagePooling import AveragePooling
 from .batchNormalization import BatchNormalization, BatchNormReLU, BatchNormLeakyReLU
 from .classificationTrainValidate import ClassificationTrainValidate

--- a/sparseconvnet/activations.py
+++ b/sparseconvnet/activations.py
@@ -22,6 +22,18 @@ class Sigmoid(Module):
        return output


+class LeakyReLU(Module):  # Leaky ReLU activation: applies F.leaky_relu to the features of a sparse tensor, leaving its metadata and spatial size as they are.
+    def __init__(self, leak=1.0 / 3):  # leak is passed as the second argument of F.leaky_relu; written 1.0 / 3 so Python 2 integer division cannot turn the default into 0 (a plain ReLU).
+        Module.__init__(self)
+        self.leak = leak
+    def forward(self, input):  # Returns a new SparseConvNetTensor; only .features is recomputed, .metadata and .spatial_size are the input's own objects.
+        output = SparseConvNetTensor()
+        output.features = F.leaky_relu(input.features, self.leak)
+        output.metadata = input.metadata
+        output.spatial_size = input.spatial_size
+        return output
+
+
 class Tanh(Module):
    def forward(self, input):
        output = SparseConvNetTensor()