Commit d27491d4 authored by Benjamin Thomas Graham

group convolutions

parent 8422a6f5
......@@ -24,9 +24,9 @@ Higher dimensional input is more likely to be sparse because of the 'curse of di
Dimension|Name in 'torch.nn'|Use cases
:--:|:--:|:--:
1|TemporalConvolution| Text, audio
2|SpatialConvolution|Lines in 2D space, e.g. handwriting
3|VolumetricConvolution|Lines and surfaces in 3D space or (2+1)D space-time
1|Conv1d| Text, audio
2|Conv2d|Lines in 2D space, e.g. handwriting
3|Conv3d|Lines and surfaces in 3D space or (2+1)D space-time
4| - |Lines, etc., in (3+1)D space-time
We use the term 'submanifold' to refer to input data that is sparse because it has a lower effective dimension than the space in which it lives, for example a one-dimensional curve in 2+ dimensional space, or a two-dimensional surface in 3+ dimensional space.
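For intuition, here is a minimal sketch of a submanifold input, assuming the usual InputLayer convention of integer coordinates with a trailing batch-index column: a one-dimensional helix activates only ~100 of the 100x64x64 sites of a 3D grid, and a SubmanifoldConvolution only does work at those sites.
```python
import torch
import sparseconvnet as scn

t = torch.arange(100, dtype=torch.float32)
coords = torch.stack([t,
                      32 + 20 * torch.sin(t / 8),
                      32 + 20 * torch.cos(t / 8),
                      torch.zeros(100)], 1).long()  # (x, y, z, batch index)
features = torch.ones(100, 1)                       # one input feature plane

net = scn.Sequential(
    scn.InputLayer(3, torch.LongTensor([100, 64, 64])),
    scn.SubmanifoldConvolution(3, 1, 8, 3, False))  # 3D, 1->8 planes, 3^3 filter
y = net([coords, features])  # SparseConvNetTensor; output sites == input sites
```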
......@@ -137,7 +137,7 @@ apt-get install unrar
```
## License
SparseConvNet is Attribution-NonCommercial 4.0 International licensed, as found in the LICENSE file.
SparseConvNet is BSD licensed, as found in the LICENSE file.
## Links
1. [ICDAR 2013 Chinese Handwriting Recognition Competition 2013](http://www.nlpr.ia.ac.cn/events/CHRcompetition2013/competition/Home.html) First place in task 3, with test error of 2.61%. Human performance on the test set was 4.81%. [Report](http://www.nlpr.ia.ac.cn/events/CHRcompetition2013/competition/ICDAR%202013%20CHR%20competition.pdf)
......
......@@ -15,29 +15,26 @@ double cpu_Deconvolution_updateOutput(
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
output_features.resize_({nActive, weight.size(1) * weight.size(3)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
auto ip = weight.size(1);
auto op = weight.size(2);
for (Int i = 0; i < (Int)_rules.size(); i++) {
auto groups = weight.size(1);
auto ip = weight.size(2);
auto op = weight.size(3);
for (Int i = 0; i < (Int)_rules.size(); ++i) {
auto r = _rules[i];
int nRules = r.size() / 2;
Int nRules = r.size() / 2;
if (nRules) {
flops += nRules * ip * op;
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 1));
// auto w = weight.select(0, i);
// auto output_rows = at::mm(input_rows, w);
// output_features.index_add_(0, rt.select(1, 0), output_rows);
auto input_rows = rule_index_select<T>(input_features, nRules, &r[1]);
flops += nRules * ip * op * groups;
auto w = weight.select(0, i);
auto output_rows = at::mm(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[0]);
auto input_rows =
rule_index_select<T>(input_features, nRules, &r[1], groups);
auto output_rows = at::matmul(input_rows, w);
rule_index_add_<T>(output_features, output_rows, nRules, &r[0], groups);
}
}
return flops;
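Read as PyTorch, the grouped product in this hunk is a batched matrix multiply over the group dimension. A sketch, assuming rule_index_select gathers the selected rows into shape (groups, nRules, ip) so that at::matmul batches per group:
```python
import torch

groups, nRules, ip, op = 2, 5, 3, 4
features = torch.randn(10, groups * ip)    # all active input sites
rows = torch.randint(0, 10, (nRules,))     # input half of one rulebook entry

x = features[rows].view(nRules, groups, ip).transpose(0, 1)  # (groups, nRules, ip)
w_i = torch.randn(groups, ip, op)          # weight for one filter offset
y = torch.matmul(x, w_i)                   # (groups, nRules, op)
out_rows = y.transpose(0, 1).reshape(nRules, groups * op)    # scattered by rule_index_add_
```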
......@@ -59,26 +56,22 @@ void cpu_Deconvolution_backward(
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto groups = weight.size(1);
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
for (Int i = 0; i < (Int)_rules.size(); i++) {
for (Int i = 0; i < (Int)_rules.size(); ++i) {
auto r = _rules[i];
int nRules = r.size() / 2;
Int nRules = r.size() / 2;
if (nRules) {
auto w = weight.select(0, i);
auto dw = d_weight.select(0, i);
// auto rt = torch::CPU(at_kINT).tensorFromBlob(&r[0], {nRules, 2});
// auto input_rows = input_features.index_select(0, rt.select(1, 1));
// auto d_output_rows = d_output_features.index_select(0, rt.select(1,
// 0));
// at::mm_out(dw, input_rows.t(), d_output_rows);
// auto d_input_rows = at::mm(d_output_rows, w.t());
// d_input_features.index_add_(0, rt.select(1, 1), d_input_rows);
auto input_rows = rule_index_select<T>(input_features, nRules, &r[1]);
auto d_output_rows = rule_index_select<T>(d_output_features, nRules, &r[0]);
at::mm_out(dw, input_rows.t(), d_output_rows);
auto d_input_rows = at::mm(d_output_rows, w.t());
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1]);
auto input_rows =
rule_index_select<T>(input_features, nRules, &r[1], groups);
auto d_output_rows =
rule_index_select<T>(d_output_features, nRules, &r[0], groups);
at::matmul_out(dw, input_rows.transpose(1, 2), d_output_rows);
auto d_input_rows = at::matmul(d_output_rows, w.transpose(1, 2));
rule_index_add_<T>(d_input_features, d_input_rows, nRules, &r[1], groups);
}
}
}
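The backward hunk computes, per group, Input^T x dOutput for dW and dOutput x W^T for dInput; a sketch with the same assumed shapes as the forward sketch:
```python
import torch

groups, nRules, ip, op = 2, 5, 3, 4
x   = torch.randn(groups, nRules, ip)   # gathered input rows
dy  = torch.randn(groups, nRules, op)   # gathered output gradients
w_i = torch.randn(groups, ip, op)       # weight for one filter offset

dw_i = torch.matmul(x.transpose(1, 2), dy)    # (groups, ip, op)
dx   = torch.matmul(dy, w_i.transpose(1, 2))  # (groups, nRules, ip), scattered back
```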
......@@ -12,13 +12,13 @@ template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
Int output_stride, Int nGroups);
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
Int output_stride, Int nGroups);
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
......@@ -32,9 +32,10 @@ double cuda_Convolution_updateOutput(
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) {
auto iF = input_features.data<T>();
......@@ -46,7 +47,8 @@ double cuda_Convolution_updateOutput(
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else {
return 0;
}
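The extra stride arguments encode the grouped feature layout: each active site stores its nGroups*ip input values contiguously, so the kernels see ip planes per group with a row stride of ip*nGroups (likewise op and op*nGroups on the output side). A sketch of the addressing this implies (a hypothetical helper, not library code):
```python
def feature_value(buf, row, g, c, planes_per_group, nGroups):
    # plane c of group g at active site `row`, for a flat row-major buffer
    row_stride = planes_per_group * nGroups
    return buf[row * row_stride + g * planes_per_group + c]
```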
......@@ -67,9 +69,10 @@ void cuda_Convolution_backward(
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_();
if (nActiveOut) {
......@@ -79,7 +82,8 @@ void cuda_Convolution_backward(
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
......@@ -98,9 +102,10 @@ double cuda_SubmanifoldConvolution_updateOutput(
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActive, op});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
output_features.resize_({nActive, op * nGroups});
if (nActive) {
auto iF = input_features.data<T>();
......@@ -112,7 +117,8 @@ double cuda_SubmanifoldConvolution_updateOutput(
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else {
return 0;
}
......@@ -130,9 +136,10 @@ void cuda_SubmanifoldConvolution_backward(
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActive, ip});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
d_input_features.resize_({nActive, ip * nGroups});
d_input_features.zero_();
if (nActive) {
......@@ -142,7 +149,8 @@ void cuda_SubmanifoldConvolution_backward(
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
......@@ -160,9 +168,10 @@ double cuda_PermutohedralSubmanifoldConvolution_updateOutput(
auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActive, op});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
output_features.resize_({nActive, op * nGroups});
if (nActive) {
auto iF = input_features.data<T>();
......@@ -174,7 +183,8 @@ double cuda_PermutohedralSubmanifoldConvolution_updateOutput(
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else {
return 0;
}
......@@ -191,9 +201,10 @@ void cuda_PermutohedralSubmanifoldConvolution_backward(
auto _rules = m.getPermutohedralSubmanifoldRuleBook(inputSize, true);
Int nActive = m.getNActive(inputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActive, ip});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
d_input_features.resize_({nActive, ip * nGroups});
d_input_features.zero_();
if (nActive) {
......@@ -203,7 +214,8 @@ void cuda_PermutohedralSubmanifoldConvolution_backward(
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
......@@ -225,9 +237,10 @@ double cuda_FullConvolution_updateOutput(
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActiveOut = mOut.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) {
auto iF = input_features.data<T>();
......@@ -239,7 +252,8 @@ double cuda_FullConvolution_updateOutput(
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else {
return 0;
}
......@@ -261,9 +275,10 @@ void cuda_FullConvolution_backward(
filterSize, filterStride, mOut);
Int nActiveIn = mIn.getNActive(inputSize);
Int nActiveOut = mOut.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_();
if (nActiveOut) {
......@@ -273,7 +288,8 @@ void cuda_FullConvolution_backward(
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
......@@ -293,9 +309,10 @@ double cuda_RandomizedStrideConvolution_updateOutput(
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) {
auto iF = input_features.data<T>();
......@@ -307,7 +324,8 @@ double cuda_RandomizedStrideConvolution_updateOutput(
else
output_features.zero_();
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
return dConvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else {
return 0;
}
......@@ -328,9 +346,10 @@ void cuda_RandomizedStrideConvolution_backward(
filterStride, true);
Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_();
if (nActiveOut) {
......@@ -340,7 +359,8 @@ void cuda_RandomizedStrideConvolution_backward(
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip * nGroups,
op, op * nGroups, nGroups);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
......
......@@ -49,6 +49,11 @@ void Convolution_bp_bias(T *d_oF, T *d_b, Int nPlanes, Int nActive) {
}
}
// input_nPlanes, output_nPlanes == planes per group
// weight = nGroups x input_nPlanes x output_nPlanes
//        = nGroups x M*K x N*K
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
......@@ -57,7 +62,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
......@@ -65,8 +70,10 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
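Each block now carries a group index in blockIdx.z alongside the output K-block index in blockIdx.y; the pointer bumps above amount to the following base offsets (a sketch, assuming the weight for one filter offset is stored as nGroups contiguous ip x op row-major matrices):
```python
def block_bases(n, g, K, ip, op):
    in_base  = g * ip               # offset within a row of ip*nGroups values
    out_base = n * K + g * op       # offset within a row of op*nGroups values
    w_base   = n * K + g * ip * op  # column block n of group g's weight matrix
    return in_base, out_base, w_base
```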
......@@ -125,7 +132,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
......@@ -133,8 +140,10 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
......@@ -199,13 +208,13 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K, nGroups), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
......@@ -215,7 +224,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
......@@ -226,7 +235,7 @@ template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
......@@ -236,7 +245,7 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
......@@ -246,10 +255,12 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
......@@ -313,7 +324,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
......@@ -323,10 +334,12 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
......@@ -402,13 +415,13 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K, nGroups), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
......@@ -420,7 +433,7 @@ template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
......@@ -434,7 +447,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
......@@ -443,8 +456,10 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
Int KO = min(K, output_nPlanes - K * n);
TACC O[V];
......@@ -507,7 +522,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
......@@ -517,10 +532,12 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
TACC dI[V];
......@@ -602,8 +619,8 @@ template <typename T>
double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes * nGroups;
double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
......@@ -611,14 +628,14 @@ double dConvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RULEBOOKITERATOR(
(dConvolution_KMxKN_forward2<
T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
V><<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride));
, w += c; flops += nHotB * c;)
} else {
RULEBOOKITERATOR(dConvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride);
output_nPlanes, output_stride, nGroups);
, w += c; flops += nHotB * c;)
}
return flops;
......@@ -628,15 +645,15 @@ template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
RULEBOOKITERATOR(
(dConvolution_KMxKN_backward_dW2<
T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
V><<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;)
......@@ -644,7 +661,7 @@ void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
RULEBOOKITERATOR(dConvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride,
output_nPlanes, output_stride);
output_nPlanes, output_stride, nGroups);
, w += c; dw += c;)
}
}
......
......@@ -8,13 +8,14 @@ template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride);
Int output_stride, Int nGroups);
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride);
Int output_nPlanes, Int output_stride,
Int nGroups);
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
......@@ -28,9 +29,10 @@ double cuda_Deconvolution_updateOutput(
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
output_features.resize_({nActiveOut, op});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
output_features.resize_({nActiveOut, op * nGroups});
if (nActiveOut) {
auto iF = input_features.data<T>();
......@@ -42,7 +44,8 @@ double cuda_Deconvolution_updateOutput(
else
output_features.zero_();
return dDeconvolution_forward2<T>(iF, oF, w, _rules, ip, ip, op, op);
return dDeconvolution_forward2<T>(iF, oF, w, _rules, ip, ip * nGroups, op,
op * nGroups, nGroups);
} else {
return 0;
}
......@@ -63,9 +66,10 @@ void cuda_Deconvolution_backward(
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActiveIn = m.getNActive(inputSize);
Int nActiveOut = m.getNActive(outputSize);
Int ip = weight.size(1);
Int op = weight.size(2);
d_input_features.resize_({nActiveIn, ip});
Int nGroups = weight.size(1);
Int ip = weight.size(2);
Int op = weight.size(3);
d_input_features.resize_({nActiveIn, ip * nGroups});
d_input_features.zero_();
if (nActiveOut) {
......@@ -75,7 +79,8 @@ void cuda_Deconvolution_backward(
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip, ip, op, op);
dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, _rules, ip,
ip * nGroups, op, op * nGroups, nGroups);
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, nActiveOut);
......
......@@ -14,7 +14,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
......@@ -22,8 +22,10 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
......@@ -82,7 +84,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
......@@ -90,8 +92,10 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
......@@ -156,13 +160,13 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K, nGroups), \
dim3(K, K / V)>>>(inFeatures, outFeatures, w, rules, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
......@@ -172,7 +176,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
......@@ -183,7 +187,7 @@ template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
......@@ -193,7 +197,7 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
......@@ -202,10 +206,12 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
......@@ -269,7 +275,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
......@@ -278,10 +284,12 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
......@@ -357,13 +365,13 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K, nGroups), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
T, K, V><<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
......@@ -375,7 +383,7 @@ template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int output_nPlanes, Int output_stride, Int nGroups) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
......@@ -389,7 +397,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
......@@ -398,8 +406,10 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
Int KO = min(K, output_nPlanes - K * n);
TACC O[V];
......@@ -462,7 +472,7 @@ dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
......@@ -472,10 +482,12 @@ dDeconvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures,
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
TACC dI[V];
......@@ -557,8 +569,8 @@ template <typename T>
double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RuleBook _rules, Int input_nPlanes,
Int input_stride, Int output_nPlanes,
Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes * nGroups;
double flops = 0;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
......@@ -566,14 +578,14 @@ double dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w,
RULEBOOKITERATOR(
(dDeconvolution_KMxKN_forward2<
T, K,
V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
V><<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rbB, nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride));
, w += c; flops += nHotB * c;)
} else {
RULEBOOKITERATOR(dDeconvolution_forward(inFeatures, outFeatures, w, rbB,
nHotB, input_nPlanes, input_stride,
output_nPlanes, output_stride);
output_nPlanes, output_stride, nGroups);
, w += c; flops += nHotB * c;)
}
return flops;
......@@ -583,15 +595,15 @@ template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, RuleBook _rules,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
Int c = input_nPlanes * output_nPlanes;
Int output_nPlanes, Int output_stride, Int nGroups) {
Int c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
RULEBOOKITERATOR(
(dDeconvolution_KMxKN_backward_dW2<
T, K,
V><<<dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
V><<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride, output_nPlanes, output_stride));
, w += c; dw += c;)
......@@ -599,9 +611,8 @@ void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
RULEBOOKITERATOR(dDeconvolution_backward_dW(inFeatures, dInFeatures,
dOutFeatures, w, dw, rbB, nHotB,
input_nPlanes, input_stride,
output_nPlanes, output_stride);
output_nPlanes, output_stride, nGroups);
, w += c; dw += c;)
}
}
#undef TACC
\ No newline at end of file
......@@ -55,21 +55,21 @@ template void Convolution_bp_bias<float>(float *d_oF, float *d_b,
Int nPlanes, Int nActive);
template double dConvolution_forward2<float>(
float *inFeatures, float *outFeatures, float *w, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride);
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride, Int nGroups);
template void dConvolution_backward_dW2<float>(
float *inFeatures, float *dInFeatures, float *dOutFeatures, float *w,
float *dw, RuleBook _rules, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride);
Int output_nPlanes, Int output_stride, Int nGroups);
template double dDeconvolution_forward2<float>(
float *inFeatures, float *outFeatures, float *w, RuleBook _rules,
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride);
Int input_nPlanes, Int input_stride, Int output_nPlanes, Int output_stride, Int nGroups);
template void dDeconvolution_backward_dW2<float>(
float *inFeatures, float *dInFeatures, float *dOutFeatures, float *w,
float *dw, RuleBook _rules, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride);
Int output_nPlanes, Int output_stride, Int nGroups);
template void InputLayer_fp<float>(float *input_features,
float *output_features, Int nRows,
......
......@@ -25,10 +25,10 @@ from .networkInNetwork import NetworkInNetwork
from .permutohedralSubmanifoldConvolution import PermutohedralSubmanifoldConvolution, permutohedral_basis
from .randomizedStrideConvolution import RandomizedStrideConvolution
from .randomizedStrideMaxPooling import RandomizedStrideMaxPooling
from .sequential import Sequential
from .sequential import Sequential, CheckpointedSequential
from .sparseConvNetTensor import SparseConvNetTensor
from .sparseToDense import SparseToDense
from .sparsify import Sparsify
from .sparsify import Sparsify, SparsifyFCS
from .spectral_norm import spectral_norm
from .submanifoldConvolution import SubmanifoldConvolution, ValidConvolution
from .tables import *
......
......@@ -41,7 +41,9 @@ class BatchNormalization(Module):
        self.bias = Parameter(torch.Tensor(nPlanes).fill_(0))
    def forward(self, input):
        assert input.features.nelement() == 0 or input.features.size(1) == self.nPlanes, (self.nPlanes, input.features.shape)
        if input.features.nelement() == 0:
            return input
        assert input.features.size(1) == self.nPlanes, (self.nPlanes, input.features.shape)
        output = SparseConvNetTensor()
        output.metadata = input.metadata
        output.spatial_size = input.spatial_size
......
......@@ -11,17 +11,18 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor
class Convolution(Module):
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
        Module.__init__(self)
        self.dimension = dimension
        self.groups = groups
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        self.filter_stride = toLongTensor(dimension, filter_stride)
        std = (2.0 / nIn / self.filter_volume)**0.5
        std = (2.0 * groups / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut).normal_(
            self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
            0,
            std))
        if bias:
......
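A hedged usage sketch of the new argument: 64->64 planes split into 4 groups, so the weight parameter has shape (filter_volume, groups, nIn//groups, nOut//groups). Deconvolution, SubmanifoldConvolution and RandomizedStrideConvolution below accept the same groups argument.
```python
import sparseconvnet as scn

conv = scn.Convolution(3, 64, 64, filter_size=3, filter_stride=2,
                       bias=False, groups=4)
print(conv.weight.shape)  # torch.Size([27, 4, 16, 16])
```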
......@@ -11,17 +11,18 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor
class Deconvolution(Module):
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
        Module.__init__(self)
        self.dimension = dimension
        self.groups = groups
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        self.filter_stride = toLongTensor(dimension, filter_stride)
        std = (2.0 / nIn / self.filter_volume)**0.5
        std = (2.0 * groups / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut).normal_(
            self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
            0,
            std))
        if bias:
......
......@@ -12,17 +12,18 @@ from .sparseConvNetTensor import SparseConvNetTensor
from .metadata import Metadata
class FullConvolution(Module):
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
        Module.__init__(self)
        self.dimension = dimension
        self.groups = groups
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        self.filter_stride = toLongTensor(dimension, filter_stride)
        std = (2.0 / nIn / self.filter_volume)**0.5
        std = (2.0 * groups / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut).normal_(
            self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
            0,
            std))
        if bias:
......@@ -68,16 +69,16 @@ class FullConvolution(Module):
    def __repr__(self):
        s = 'FullConvolution ' + str(self.nIn) + '->' + str(self.nOut) + ' C'
        if self.filter_size.max() == self.filter_size.min() and\
                self.filter_stride.max() == self.filter_stride.min():
            s = s + str(self.filter_size[0]) + '/' + str(self.filter_stride[0])
        if self.filter_size.max().item() == self.filter_size.min().item() and\
                self.filter_stride.max().item() == self.filter_stride.min().item():
            s = s + str(self.filter_size[0].item()) + '/' + str(self.filter_stride[0].item())
        else:
            s = s + '(' + str(self.filter_size[0])
            s = s + '(' + str(self.filter_size[0].item())
            for i in self.filter_size[1:]:
                s = s + ',' + str(i)
            s = s + ')/(' + str(self.filter_stride[0])
                s = s + ',' + str(i.item())
            s = s + ')/(' + str(self.filter_stride[0].item())
            for i in self.filter_stride[1:]:
                s = s + ',' + str(i)
                s = s + ',' + str(i.item())
            s = s + ')'
        return s
......
......@@ -310,3 +310,42 @@ def FullyConvolutionalNet(dimension, reps, nPlanes, residual_blocks=False, downs
        return m
    m = U(nPlanes)
    return m
def FullConvolutionalNetIntegratedLinear(dimension, reps, nPlanes, nClasses=-1, residual_blocks=False, downsample=[2,2], leakiness=0):
    if nClasses==-1:
        nClasses=nPlanes[0]
    def l(x):
        return x+nPlanes
    def foo(m,np):
        for _ in range(reps):
            if residual_blocks: #ResNet style blocks
                m.add(scn.ConcatTable()
                      .add(scn.Identity())
                      .add(scn.Sequential()
                           .add(scn.BatchNormLeakyReLU(np,leakiness=leakiness))
                           .add(scn.SubmanifoldConvolution(dimension, np, np, 3, False))
                           .add(scn.BatchNormLeakyReLU(np,leakiness=leakiness))
                           .add(scn.SubmanifoldConvolution(dimension, np, np, 3, False)))
                ).add(scn.AddTable())
            else: #VGG style blocks
                m.add(scn.BatchNormLeakyReLU(np,leakiness=leakiness)
                ).add(scn.SubmanifoldConvolution(dimension, np, np, 3, False))
    def bar(m,nPlanes,bias):
        m.add(scn.BatchNormLeakyReLU(nPlanes,leakiness=leakiness))
        m.add(scn.NetworkInNetwork(nPlanes,nClasses,bias)) #accumulate softmax input, only one set of biases
    def baz(nPlanes):
        m=scn.Sequential()
        foo(m,nPlanes[0])
        if len(nPlanes)==1:
            bar(m,nPlanes[0],True)
        else:
            a=scn.Sequential()
            bar(a,nPlanes[0],False)
            b=scn.Sequential(
                scn.BatchNormLeakyReLU(nPlanes[0],leakiness=leakiness),
                scn.Convolution(dimension, nPlanes[0], nPlanes[1], downsample[0], downsample[1], False),
                baz(nPlanes[1:]),
                scn.UnPooling(dimension, downsample[0], downsample[1]))
            m.add(scn.ConcatTable().add(a).add(b))
            m.add(scn.AddTable())
        return m
    return baz(nPlanes)
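A hedged construction example for the function above (argument values are illustrative, and it is assumed the function is exported alongside the other architectures):
```python
import sparseconvnet as scn

net = scn.FullConvolutionalNetIntegratedLinear(
    dimension=3, reps=2, nPlanes=[32, 64, 96], nClasses=20)
```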
......@@ -22,17 +22,18 @@ class RandomizedStrideConvolution(Module):
    sparseconvnet.Deconvolution module in a UNet style network, to restore
    the input sparsity pattern.
    """
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias):
    def __init__(self, dimension, nIn, nOut, filter_size, filter_stride, bias, groups=1):
        Module.__init__(self)
        self.dimension = dimension
        self.groups = groups
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        self.filter_stride = toLongTensor(dimension, filter_stride)
        std = (2.0 / nIn / self.filter_volume)**0.5
        std = (2.0 * groups / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut).normal_(
            self.filter_volume, groups, nIn//groups, nOut//groups).normal_(
            0,
            std))
        if bias:
......
......@@ -4,7 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch, torch.utils.checkpoint
class Sequential(torch.nn.Sequential):
    def input_spatial_size(self, out_size):
......@@ -15,6 +15,15 @@ class Sequential(torch.nn.Sequential):
    def add(self, module):
        self._modules[str(len(self._modules))] = module
        return self
    def insert(self, index, module):
        for i in range(len(self._modules), index, -1):
            self._modules[str(i)] = self._modules[str(i - 1)]
        self._modules[str(index)] = module
    def append(self, module):
        self._modules[str(len(self._modules))] = module
        return self
    def reweight(self, input):
        for module in self._modules.values():
......@@ -44,3 +53,9 @@ class Sequential(torch.nn.Sequential):
            else:
                input = module(input)
        return input
class CheckpointedSequential(Sequential):
    def forward(self, x):
        def run(x):
            return Sequential.forward(self,x)
        return torch.utils.checkpoint.checkpoint(run, x)
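A sketch of the new class: CheckpointedSequential recomputes its body during backward via torch.utils.checkpoint instead of storing intermediate activations (the module choices here are illustrative only):
```python
import sparseconvnet as scn

block = scn.CheckpointedSequential(
    scn.SubmanifoldConvolution(3, 32, 32, 3, False),
    scn.BatchNormReLU(32),
    scn.SubmanifoldConvolution(3, 32, 32, 3, False))
# used like any scn.Sequential; its activations are recomputed in backward
```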
......@@ -80,13 +80,16 @@ class Sparsify(Module):
            self.net = Sequential(NetworkInNetwork(nIn,1,True),Sigmoid())
        else:
            self.net = NetworkInNetwork(nIn,1,True)
        self.threshold=0.5 if activation else 0
    def forward(self,input):
        if input.features.numel():
            output = SparseConvNetTensor()
            output.spatial_size = input.spatial_size
            output.metadata = Metadata(self.dimension)
            output.mask = self.net(input).features.view(-1)
            active = output.mask>(0.5 if self.activation else 0)
            if self.threshold<0:
                print(output.mask.mean(),output.mask.std())
            active = output.mask>self.threshold
            output.features=input.features[active]
            active=active.cpu()
            input.metadata.sparsifyMetadata(
......
......@@ -4,8 +4,6 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# 'SubmanifoldConvolution == SubmanifoldConvolution'
import sparseconvnet
import sparseconvnet.SCN
from torch.autograd import Function
......@@ -14,16 +12,17 @@ from .utils import *
from .sparseConvNetTensor import SparseConvNetTensor
class SubmanifoldConvolution(Module):
    def __init__(self, dimension, nIn, nOut, filter_size, bias):
    def __init__(self, dimension, nIn, nOut, filter_size, bias, groups=1):
        Module.__init__(self)
        self.dimension = dimension
        self.groups = groups
        self.nIn = nIn
        self.nOut = nOut
        self.filter_size = toLongTensor(dimension, filter_size)
        self.filter_volume = self.filter_size.prod().item()
        std = (2.0 / nIn / self.filter_volume)**0.5
        std = (2.0 * groups / nIn / self.filter_volume)**0.5
        self.weight = Parameter(torch.Tensor(
            self.filter_volume, nIn, nOut
            self.filter_volume, groups, nIn//groups, nOut//groups
        ).normal_(0, std))
        if bias:
            self.bias = Parameter(torch.Tensor(nOut).zero_())
......
......@@ -132,7 +132,7 @@ def prepare_BLInput(l,f):
    for i, (ll, ff) in enumerate(zip(l,f)):
        L[i,:ll.size(0),:].copy_(ll)
        F[i,:ff.size(0),:].copy_(ff)
    return (L,F)
    return [L,F]
def checkpoint_restore(model,exp_name,name2,use_cuda=True,epoch=0):
    if use_cuda:
......@@ -155,8 +155,10 @@ def checkpoint_restore(model,exp_name,name2,use_cuda=True,epoch=0):
def is_power2(num):
    return num != 0 and ((num & (num - 1)) == 0)
def has_only_one_nonzero_digit(num): #https://oeis.org/A037124
    return num != 0 and (num/10**math.floor(math.log(num,10))).is_integer()
def checkpoint_save(model,exp_name,name2,epoch, use_cuda=True):
    f=exp_name+'-%09d-'%epoch+name2+'.pth'
    model.cpu()
......@@ -170,24 +172,128 @@ def checkpoint_save(model,exp_name,name2,epoch, use_cuda=True):
    if not is_power2(epoch):
        os.remove(f)
def random_rotation(dimension=3):
    return torch.qr(torch.randn(dimension,dimension))[0]
class LayerNormLeakyReLU(torch.nn.Module):
    def __init__(self,num_features,leakiness):
        torch.nn.Module.__init__(self)
        self.leakiness=leakiness
        self.in1d=torch.nn.LayerNorm(num_features)
    def forward(self,x):
        if x.features.numel():
            x.features=self.in1d(x.features)
            x.features=torch.nn.functional.leaky_relu(x.features,self.leakiness,inplace=True)
        return x
def random_rotation(dimension=3,allow_mirror=False):
    r=torch.qr(torch.randn(dimension,dimension))[0]
    f=torch.randint(2,(dimension,))
    if f.sum()%2==0 and not allow_mirror:
        f=1-f
    return r*(2*f-1).float()
def squareroot_rotation(a):
    import scipy.spatial
    b=scipy.spatial.transform.Slerp(
        [0,1],
        scipy.spatial.transform.Rotation.from_dcm(torch.stack([torch.eye(3),a])))([0.5]).as_dcm()
    return torch.from_numpy(b).float()[0]
def voxelize_pointcloud(xyz,rgb,average=True,accumulate=False):
    if xyz.numel()==0:
        return xyz, rgb
    if average or accumulate:
        xyz,inv,counts=np.unique(xyz.numpy(),axis=0,return_inverse=True,return_counts=True)
        xyz=torch.from_numpy(xyz)
        inv=torch.from_numpy(inv)
        rgb_out=torch.zeros(xyz.size(0),rgb.size(1),dtype=torch.float32)
        rgb_out.index_add_(0,inv,rgb)
        if average:
            rgb=rgb_out/torch.from_numpy(counts[:,None]).float()
        else:
            rgb=rgb_out
        return xyz, rgb
    else:
        xyz,idxs=np.unique(xyz,axis=0,return_index=True)
        xyz=torch.from_numpy(xyz)
        rgb=rgb[idxs]
        return xyz, rgb
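A usage sketch (voxel_size is a hypothetical scale factor, not part of the library): quantize coordinates, then average the colours of points falling in the same voxel:
```python
xyz_v, rgb_v = voxelize_pointcloud((xyz / voxel_size).floor(), rgb, average=True)
```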
class checkpointFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, x_features, x_metadata, x_spatial_size):
        ctx.run_function = run_function
        ctx.save_for_backward(x_features, x_spatial_size)
        ctx.x_metadata=x_metadata
        with torch.no_grad():
            y = run_function(
                scn.SparseConvNetTensor
                (x_features, x_metadata, x_spatial_size))
        return y.features
    @staticmethod
    def backward(ctx, grad_y_features):
        x_features, x_spatial_size = ctx.saved_tensors
        x_features = x_features.detach()
        x_features.requires_grad = True
        with torch.enable_grad():
            y = ctx.run_function(
                scn.SparseConvNetTensor
                (x_features, ctx.x_metadata, x_spatial_size))
        torch.autograd.backward(y.features, grad_y_features,retain_graph=False)
        return None, x_features.grad, None, None
def checkpoint101(run_function, x, down=1):
    f=checkpointFunction.apply(run_function, x.features, x.metadata, x.spatial_size)
    s=x.spatial_size//down
    return scn.SparseConvNetTensor(f, x.metadata, s)
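A sketch of use for checkpoint101: wrap a block that divides the spatial size by `down`, so only the block's input features are stored for backward (`block` here stands for any scn module chain; an assumption for illustration):
```python
y = checkpoint101(block, x, down=2)   # x, y: SparseConvNetTensor
```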
def matplotlib_cubes(ax, positions,colors):
    from mpl_toolkits.mplot3d import Axes3D
    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
    """
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(15,15))
    ax = fig.gca(projection='3d')
    ...
    plt.show()
    """
    try:
        positions=positions.numpy()
        colors=colors.numpy()
        X = np.array([[[0, 1, 0], [0, 0, 0], [1, 0, 0], [1, 1, 0]],
                      [[0, 0, 0], [0, 0, 1], [1, 0, 1], [1, 0, 0]],
                      [[1, 0, 1], [1, 0, 0], [1, 1, 0], [1, 1, 1]],
                      [[0, 0, 1], [0, 0, 0], [0, 1, 0], [0, 1, 1]],
                      [[0, 1, 0], [0, 1, 1], [1, 1, 1], [1, 1, 0]],
                      [[0, 1, 1], [0, 0, 1], [1, 0, 1], [1, 1, 1]]]).astype(np.float32)[None]-0.5
        X=X+positions[:,None,None,:]
        X.resize(X.shape[0]*6,4,3)
        m=positions.min(0)
        M=positions.max(0)+1
        ax.set_xlim([m[0],M[0]])
        ax.set_ylim([m[1],M[1]])
        ax.set_zlim([m[2],M[2]])
        ax.add_collection3d(Poly3DCollection(X,
            facecolors=np.repeat(colors,6, axis=0)))
    except:
        print('matplotlibcubes fail!?!')
        pass
    ax.set_axis_off()
def matplotlib_planes(ax, positions,colors):
    from mpl_toolkits.mplot3d import Axes3D
    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
    """
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(15,15))
    ax = fig.gca(projection='3d')
    ...
    plt.show()
    """
    try:
        positions=positions.numpy()
        colors=colors.numpy()
        X = np.array([[[0, -0.5, 0.5], [0, -0.5, -0.5], [0, 0.5, -0.5], [0, 0.5, 0.5]]]).astype(np.float32)[None]
        X=X+positions[:,None,None,:]
        X.resize(X.shape[0]*1,4,3)
        m=positions.min(0)
        M=positions.max(0)+1
        ax.set_xlim([m[0],M[0]])
        ax.set_ylim([m[1],M[1]])
        ax.set_zlim([m[2],M[2]])
        ax.add_collection3d(Poly3DCollection(X,
            facecolors=np.repeat(colors,1, axis=0)))
    except:
        pass
    ax.set_axis_off()
def visdom_scatter(vis, xyz, rgb, win='3d', markersize=3):
    vis.scatter(
        xyz,
        opts={'markersize': markersize,'markercolor': rgb},
        win=win)
def voxelize_pointcloud(xyz,rgb):
    xyz,inv,counts=np.unique(xyz.long().numpy(),axis=0,return_inverse=True,return_counts=True)
    xyz=torch.from_numpy(xyz)
    inv=torch.from_numpy(inv)
    rgb_out=torch.zeros(xyz.size(0),rgb.size(1),dtype=torch.float32)
    rgb_out.index_add_(0,inv,rgb)
    return xyz, rgb_out/torch.from_numpy(counts[:,None]).float()