Commit 2c4ed608 authored by Benjamin Thomas Graham's avatar Benjamin Thomas Graham
Browse files

Goodbye THNN. Hello ATen!

parent 6d4475db
...@@ -128,19 +128,19 @@ python VGGplus.py ...@@ -128,19 +128,19 @@ python VGGplus.py
## Setup ## Setup
Tested with Ubuntu 16.04, Python 3 in [Miniconda](https://conda.io/miniconda.html) and PyTorch v0.4 (with merged Tensors/Variables). Tested with Ubuntu 16.04, Python 3.6 in [Miniconda](https://conda.io/miniconda.html) and PyTorch v0.4 (with merged Tensors/Variables).
``` ```
conda install pytorch -c pytorch conda install pytorch -c pytorch
conda install google-sparsehash -c bioconda # OR apt-get install libsparsehash-dev conda install google-sparsehash -c bioconda # OR apt-get install libsparsehash-dev
conda install -c anaconda pillow
git clone git@github.com:facebookresearch/SparseConvNet.git git clone git@github.com:facebookresearch/SparseConvNet.git
cd SparseConvNet/ cd SparseConvNet/PyTorch/
python setup.py install bash build.sh
``` ```
To run the examples you may also need to install unrar and TorchNet: To run the examples you may also need to install unrar:
``` ```
apt-get install unrar apt-get install unrar
pip install git+https://github.com/pytorch/tnt.git@master
``` ```
...@@ -154,7 +154,7 @@ pip install git+https://github.com/pytorch/tnt.git@master ...@@ -154,7 +154,7 @@ pip install git+https://github.com/pytorch/tnt.git@master
6. [Kaggle Diabetic Retinopathy Detection, 2015](https://www.kaggle.com/c/diabetic-retinopathy-detection/) First place in the Kaggle Diabetic Retinopathy Detection competition. 6. [Kaggle Diabetic Retinopathy Detection, 2015](https://www.kaggle.com/c/diabetic-retinopathy-detection/) First place in the Kaggle Diabetic Retinopathy Detection competition.
7. [Submanifold Sparse Convolutional Networks, 2017](https://arxiv.org/abs/1706.01307) Introduces deep 'submanifold' SparseConvNets. 7. [Submanifold Sparse Convolutional Networks, 2017](https://arxiv.org/abs/1706.01307) Introduces deep 'submanifold' SparseConvNets.
8. [Workshop on Learning to See from 3D Data, 2017](https://shapenet.cs.stanford.edu/iccv17workshop/) First place in the [semantic segmentation](https://shapenet.cs.stanford.edu/iccv17/) competition. [Report](https://arxiv.org/pdf/1710.06104) 8. [Workshop on Learning to See from 3D Data, 2017](https://shapenet.cs.stanford.edu/iccv17workshop/) First place in the [semantic segmentation](https://shapenet.cs.stanford.edu/iccv17/) competition. [Report](https://arxiv.org/pdf/1710.06104)
9. [3D Semantic Segmentation with Submanifold Sparse Convolutional Networks, 2017](https://arxiv.org/abs/1711.10275) Semantic segmentation for the ShapeNet Core55 and NYU-DepthV2 datasets 9. [3D Semantic Segmentation with Submanifold Sparse Convolutional Networks, 2017](https://arxiv.org/abs/1711.10275) Semantic segmentation for the ShapeNet Core55 and NYU-DepthV2 datasets, CVPR 2018
### Citations ### Citations
......
#!/bin/bash
rm -rf build/ sparseconvnet.egg-info sparseconvnet_SCN*.so
python setup.py install
...@@ -48,10 +48,10 @@ p['lr_decay'] = 4e-2 ...@@ -48,10 +48,10 @@ p['lr_decay'] = 4e-2
p['weight_decay'] = 1e-4 p['weight_decay'] = 1e-4
p['momentum'] = 0.9 p['momentum'] = 0.9
p['check_point'] = True p['check_point'] = True
p['use_gpu'] = torch.cuda.is_available() p['use_cuda'] = torch.cuda.is_available()
dtype = 'torch.cuda.FloatTensor' if p['use_gpu'] else 'torch.FloatTensor' dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor'
dtypei = 'torch.cuda.LongTensor' if p['use_gpu'] else 'torch.LongTensor' dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor'
if p['use_gpu']: if p['use_cuda']:
model.cuda() model.cuda()
criterion.cuda() criterion.cuda()
optimizer = optim.SGD(model.parameters(), optimizer = optim.SGD(model.parameters(),
......
...@@ -48,10 +48,10 @@ p['lr_decay'] = 4e-2 ...@@ -48,10 +48,10 @@ p['lr_decay'] = 4e-2
p['weight_decay'] = 1e-4 p['weight_decay'] = 1e-4
p['momentum'] = 0.9 p['momentum'] = 0.9
p['check_point'] = True p['check_point'] = True
p['use_gpu'] = torch.cuda.is_available() p['use_cuda'] = torch.cuda.is_available()
dtype = 'torch.cuda.FloatTensor' if p['use_gpu'] else 'torch.FloatTensor' dtype = 'torch.cuda.FloatTensor' if p['use_cuda'] else 'torch.FloatTensor'
dtypei = 'torch.cuda.LongTensor' if p['use_gpu'] else 'torch.LongTensor' dtypei = 'torch.cuda.LongTensor' if p['use_cuda'] else 'torch.LongTensor'
if p['use_gpu']: if p['use_cuda']:
model.cuda() model.cuda()
criterion.cuda() criterion.cuda()
optimizer = optim.SGD(model.parameters(), optimizer = optim.SGD(model.parameters(),
......
...@@ -10,40 +10,41 @@ import sparseconvnet as scn ...@@ -10,40 +10,41 @@ import sparseconvnet as scn
from data import get_iterators from data import get_iterators
# two-dimensional SparseConvNet # two-dimensional SparseConvNet
class Model(nn.Module): class Model(nn.Module):
def __init__(self): def __init__(self):
nn.Module.__init__(self) nn.Module.__init__(self)
self.sparseModel = scn.Sequential( self.sparseModel = scn.Sequential(
).add(scn.SubmanifoldConvolution(2, 3, 8, 3, False) scn.SubmanifoldConvolution(2, 3, 8, 3, False),
).add(scn.MaxPooling(2, 3, 2) scn.MaxPooling(2, 3, 2),
).add(scn.SparseResNet(2, 8, [ scn.SparseResNet(2, 8, [
['b', 8, 2, 1], ['b', 8, 2, 1],
['b', 16, 2, 2], ['b', 16, 2, 2],
['b', 24, 2, 2], ['b', 24, 2, 2],
['b', 32, 2, 2]]) ['b', 32, 2, 2]]),
).add(scn.Convolution(2, 32, 64, 5, 1, False) scn.Convolution(2, 32, 64, 5, 1, False),
).add(scn.BatchNormReLU(64) scn.BatchNormReLU(64),
).add(scn.SparseToDense(2, 64)) scn.SparseToDense(2, 64))
self.spatial_size= self.sparseModel.input_spatial_size(torch.LongTensor([1, 1]))
self.inputLayer = scn.InputLayer(2,self.spatial_size,2)
self.linear = nn.Linear(64, 183) self.linear = nn.Linear(64, 183)
def forward(self, x): def forward(self, x):
x = self.inputLayer(x)
x = self.sparseModel(x) x = self.sparseModel(x)
x = x.view(-1, 64) x = x.view(-1, 64)
x = self.linear(x) x = self.linear(x)
return x return x
model = Model() model = Model()
spatial_size = model.sparseModel.input_spatial_size(torch.LongTensor([1, 1])) scale=63
print('Input spatial size:', spatial_size) dataset = get_iterators(model.spatial_size, scale)
dataset = get_iterators(spatial_size, 63, 3) print('Input spatial size:', model.spatial_size, 'Data scale:', scale)
scn.ClassificationTrainValidate( scn.ClassificationTrainValidate(
model, dataset, model, dataset,
{'n_epochs': 100, {'n_epochs': 100,
'initial_lr': 0.1, 'initial_lr': 0.1,
'lr_decay': 0.05, 'lr_decay': 0.05,
'weight_decay': 1e-4, 'weight_decay': 1e-4,
'use_gpu': torch.cuda.is_available(), 'use_cuda': torch.cuda.is_available(),
'check_point': True, }) 'check_point': False, })
...@@ -13,32 +13,36 @@ from data import get_iterators ...@@ -13,32 +13,36 @@ from data import get_iterators
class Model(nn.Module): class Model(nn.Module):
def __init__(self): def __init__(self):
nn.Module.__init__(self) nn.Module.__init__(self)
self.sparseModel = scn.SparseVggNet(2, 3, [ self.sparseModel = scn.Sequential(
scn.SparseVggNet(2, 3, [
['C', 8, ], ['C', 8], 'MP', ['C', 8, ], ['C', 8], 'MP',
['C', 16], ['C', 16], 'MP', ['C', 16], ['C', 16], 'MP',
['C', 16 + 8], ['C', 16 + 8], 'MP', ['C', 16, 8], ['C', 16, 8], 'MP',
['C', 24 + 8], ['C', 24 + 8], 'MP'] ['C', 24, 8], ['C', 24, 8], 'MP']),
).add(scn.Convolution(2, 32, 64, 5, 1, False) scn.Convolution(2, 32, 64, 5, 1, False),
).add(scn.BatchNormReLU(64) scn.BatchNormReLU(64),
).add(scn.SparseToDense(2, 64)) scn.SparseToDense(2, 64))
self.spatial_size= self.sparseModel.input_spatial_size(torch.LongTensor([1, 1]))
self.inputLayer = scn.InputLayer(2,self.spatial_size,2)
self.linear = nn.Linear(64, 183) self.linear = nn.Linear(64, 183)
def forward(self, x): def forward(self, x):
x = self.inputLayer(x)
x = self.sparseModel(x) x = self.sparseModel(x)
x = x.view(-1, 64) x = x.view(-1, 64)
x = self.linear(x) x = self.linear(x)
return x return x
model = Model() model = Model()
spatial_size = model.sparseModel.input_spatial_size(torch.LongTensor([1, 1])) scale=63
print('Input spatial size:', spatial_size) dataset = get_iterators(model.spatial_size, scale)
dataset = get_iterators(spatial_size, 63, 3) print('Input spatial size:', model.spatial_size, 'Data scale:', scale)
scn.ClassificationTrainValidate( scn.ClassificationTrainValidate(
model, dataset, model, dataset,
{'n_epochs': 100, {'n_epochs': 100,
'initial_lr': 0.1, 'initial_lr': 0.1,
'lr_decay': 0.05, 'lr_decay': 0.05,
'weight_decay': 1e-4, 'weight_decay': 1e-4,
'use_gpu': torch.cuda.is_available(), 'use_cuda': torch.cuda.is_available(),
'check_point': True, }) 'check_point': False, })
...@@ -19,114 +19,90 @@ if not os.path.exists('pickle/'): ...@@ -19,114 +19,90 @@ if not os.path.exists('pickle/'):
import process import process
def train(spatial_size, Scale, precomputeSize): def interp(sample,x,y):
d = pickle.load(open('pickle/train.pickle', 'rb')) return torch.from_numpy(np.hstack([np.interp(sample.numpy(),x.numpy(),y[:,i].numpy())[:,None] for i in range(y.shape[1])])).float()
print('Replicating training set 10 times (1 epoch = 10 iterations through the training set = 10x6588 training samples)') class Data(torch.utils.data.Dataset):
for i in range(9): def __init__(self,file,scale=63,repeats=1):
for j in range(6588): torch.utils.data.Dataset.__init__(self)
d.append(d[j]) self.data = pickle.load(open(file, 'rb'))
for i, x in enumerate(d): for j in range(len(self.data)):
x['idx'] = i strokes=[]
d = torchnet.dataset.ListDataset(d) features=[]
randperm = torch.randperm(len(d)) for k,stroke in enumerate(self.data[j]['input']):
if len(stroke)>1:
def perm(idx, size): stroke=stroke.float()/255-0.5
return randperm[idx] stroke*=scale-1e-3
delta=stroke[1:]-stroke[:-1]
mag=(delta**2).sum(1)**0.5
l=mag.cumsum(0)
zl=torch.cat([torch.zeros(1),l])
strokes.append(interp(torch.arange(0,zl[-1]),zl,stroke))
delta/=mag[:,None]
delta=torch.Tensor(delta[[i//2 for i in range(2*len(l))]])
zl_=zl[[i//2 for i in range(1,2*len(l)+1)]]
features.append(interp(torch.arange(0,zl[-1]),zl_,delta))
self.data[j]['coords'] = torch.cat(strokes,0)
self.data[j]['features'] = torch.cat(features,0)
self.data[j]['target']-=1
if repeats>1:
print('Replicating dataset: 1 epoch = %d iterations of the dataset; %d x %d = %d training samples'%(repeats, repeats, len(self.data), repeats * len(self.data)))
for j in range(len(self.data)):
for i in range(repeats-1):
self.data.append(self.data[j])
for i, x in enumerate(self.data):
x['idx'] = i
def __getitem__(self,n):
return self.data[n]
def __len__(self):
return len(self.data)
def TrainMergeFn(spatial_size=95, jitter=8):
center = spatial_size/2
def merge(tbl): def merge(tbl):
inp = scn.InputBatch(2, spatial_size) v=torch.Tensor([[1,0,0]])
center = spatial_size.float().view(1, 2) / 2 targets=[x['target'] for x in tbl]
p = torch.LongTensor(2) locations=[]
v = torch.FloatTensor([1, 0, 0]) features=[]
np_random = np.random.RandomState(tbl['idx']) for idx,char in enumerate(tbl):
for char in tbl['input']:
inp.add_sample()
m = torch.eye(2) m = torch.eye(2)
r = np_random.randint(1, 3) r = torch.randint(0,3,[1]).int().item()
alpha = random.uniform(-0.2, 0.2) alpha = torch.rand(1).item()*0.4-0.2
if alpha == 1: if r == 1:
m[0][1] = alpha m[0][1] = alpha
elif alpha == 2: elif r == 2:
m[1][0] = alpha m[1][0] = alpha
else: else:
m = torch.mm(m, torch.FloatTensor( m = torch.mm(m, torch.FloatTensor(
[[math.cos(alpha), math.sin(alpha)], [[math.cos(alpha), math.sin(alpha)],
[-math.sin(alpha), math.cos(alpha)]])) [-math.sin(alpha), math.cos(alpha)]]))
c = center + torch.FloatTensor(1, 2).uniform_(-8, 8) coords=char['coords']
for stroke in char: coords = torch.mm(coords, m) + torch.FloatTensor(1, 2).uniform_(center-jitter, center+jitter)
stroke = stroke.float() / 255 - 0.5 coords = torch.cat([coords.long(),torch.LongTensor([idx]).expand([coords.size(0),1])],1)
stroke = c.expand_as(stroke) + \ locations.append(coords)
torch.mm(stroke, m * (Scale - 0.01)) f=char['features']
############################################################### f=torch.mm(f, m)
# To avoid GIL problems use a helper function: f /= (f**2).sum(1,keepdim=True)**0.5
scn.dim_fn( f = torch.cat([f,torch.ones([f.size(0),1])],1)
2, features.append(f)
'drawCurve')( return {'input': scn.InputLayerInput(torch.cat(locations,0), torch.cat(features,0)), 'target': torch.LongTensor(targets)}
inp.metadata.ffi, return merge
inp.features, def TestMergeFn(spatial_size=95):
stroke) center = spatial_size/2
###############################################################
# Above is equivalent to :
# x1,x2,y1,y2,l=0,stroke[0][0],0,stroke[0][1],0
# for i in range(1,stroke.size(0)):
# x1=x2
# y1=y2
# x2=stroke[i][0]
# y2=stroke[i][1]
# l=1e-10+((x2-x1)**2+(y2-y1)**2)**0.5
# v[1]=(x2-x1)/l
# v[2]=(y2-y1)/l
# l=max(x2-x1,y2-y1,x1-x2,y1-y2,0.9)
# for j in np.arange(0,1,1/l):
# p[0]=math.floor(x1*j+x2*(1-j))
# p[1]=math.floor(y1*j+y2*(1-j))
# inp.set_location(p,v,False)
###############################################################
inp.precomputeMetadata(precomputeSize)
return {'input': inp, 'target': torch.LongTensor(tbl['target']) - 1}
bd = torchnet.dataset.BatchDataset(d, 108, perm=perm, merge=merge)
tdi = scn.threadDatasetIterator(bd)
def iter():
randperm.copy_(torch.randperm(len(d)))
return tdi()
return iter
def val(spatial_size, Scale, precomputeSize):
d = pickle.load(open('pickle/test.pickle', 'rb'))
d = torchnet.dataset.ListDataset(d)
randperm = torch.randperm(len(d))
def perm(idx, size):
return randperm[idx]
def merge(tbl): def merge(tbl):
inp = scn.InputBatch(2, spatial_size) v=torch.Tensor([[1,0,0]])
center = spatial_size.float().view(1, 2) / 2 targets=[x['target'] for x in tbl]
p = torch.LongTensor(2) locations=[]
v = torch.FloatTensor([1, 0, 0]) features=[]
for char in tbl['input']: for idx,char in enumerate(tbl):
inp.add_sample() coords=char['coords']+center
for stroke in char: coords = torch.cat([coords.long(),torch.LongTensor([idx]).expand([coords.size(0),1])],1)
stroke = stroke.float() * (Scale - 0.01) / 255 - 0.5 * (Scale - 0.01) locations.append(coords)
stroke += center.expand_as(stroke) f=char['features']
scn.dim_fn( f = torch.cat([f,torch.ones([f.size(0),1])],1)
2, features.append(f)
'drawCurve')( return {'input': scn.InputLayerInput(torch.cat(locations,0), torch.cat(features,0)), 'target': torch.LongTensor(targets)}
inp.metadata.ffi, return merge
inp.features,
stroke)
inp.precomputeMetadata(precomputeSize)
return {'input': inp, 'target': torch.LongTensor(tbl['target']) - 1}
bd = torchnet.dataset.BatchDataset(d, 183, perm=perm, merge=merge)
tdi = scn.threadDatasetIterator(bd)
def iter():
randperm.copy_(torch.randperm(len(d)))
return tdi()
return iter
def get_iterators(*args): def get_iterators(*args):
return {'train': train(*args), 'val': val(*args)} return {'train': torch.utils.data.DataLoader(Data('pickle/train.pickle',repeats=10), collate_fn=TrainMergeFn(), batch_size=108, shuffle=True, num_workers=10),
'val': torch.utils.data.DataLoader(Data('pickle/test.pickle',repeats=1), collate_fn=TestMergeFn(), batch_size=183, shuffle=True, num_workers=10)}
...@@ -10,40 +10,41 @@ import sparseconvnet as scn ...@@ -10,40 +10,41 @@ import sparseconvnet as scn
from data import get_iterators from data import get_iterators
# two-dimensional SparseConvNet # two-dimensional SparseConvNet
class Model(nn.Module): class Model(nn.Module):
def __init__(self): def __init__(self):
nn.Module.__init__(self) nn.Module.__init__(self)
self.sparseModel = scn.Sequential( self.sparseModel = scn.Sequential(
).add(scn.SubmanifoldConvolution(2, 3, 16, 3, False) scn.SubmanifoldConvolution(2, 3, 16, 3, False),
).add(scn.MaxPooling(2, 3, 2) scn.MaxPooling(2, 3, 2),
).add(scn.SparseResNet(2, 16, [ scn.SparseResNet(2, 16, [
['b', 16, 2, 1], ['b', 16, 2, 1],
['b', 32, 2, 2], ['b', 32, 2, 2],
['b', 48, 2, 2], ['b', 48, 2, 2],
['b', 96, 2, 2]]) ['b', 96, 2, 2]]),
).add(scn.Convolution(2, 96, 128, 5, 1, False) scn.Convolution(2, 96, 128, 3, 1, False),
).add(scn.BatchNormReLU(128) scn.BatchNormReLU(128),
).add(scn.SparseToDense(2, 128)) scn.SparseToDense(2, 128))
self.spatial_size= self.sparseModel.input_spatial_size(torch.LongTensor([1, 1]))
self.inputLayer = scn.InputLayer(2,self.spatial_size,2)
self.linear = nn.Linear(128, 3755) self.linear = nn.Linear(128, 3755)
def forward(self, x): def forward(self, x):
x = self.inputLayer(x)
x = self.sparseModel(x) x = self.sparseModel(x)
x = x.view(-1, 128) x = x.view(-1, 128)
x = self.linear(x) x = self.linear(x)
return x return x
model = Model() model = Model()
spatial_size = model.sparseModel.input_spatial_size(torch.LongTensor([1, 1])) scale=63
print('Input spatial size:', spatial_size) dataset = get_iterators(model.spatial_size, scale)
dataset = get_iterators(spatial_size, 63, 3) print('Input spatial size:', model.spatial_size, 'Data scale:', scale)
scn.ClassificationTrainValidate( scn.ClassificationTrainValidate(
model, dataset, model, dataset,
{'n_epochs': 100, {'n_epochs': 100,
'initial_lr': 0.1, 'initial_lr': 0.1,
'lr_decay': 0.05, 'lr_decay': 0.05,
'weight_decay': 1e-4, 'weight_decay': 1e-4,
'use_gpu': torch.cuda.is_available(), 'use_cuda': torch.cuda.is_available(),
'check_point': True, }) 'check_point': False, })
...@@ -22,23 +22,27 @@ class Model(nn.Module): ...@@ -22,23 +22,27 @@ class Model(nn.Module):
).add(scn.Convolution(2, 96, 128, 3, 2, False) ).add(scn.Convolution(2, 96, 128, 3, 2, False)
).add(scn.BatchNormReLU(128) ).add(scn.BatchNormReLU(128)
).add(scn.SparseToDense(2, 128)) ).add(scn.SparseToDense(2, 128))
self.spatial_size= self.sparseModel.input_spatial_size(torch.LongTensor([1, 1]))
self.inputLayer = scn.InputLayer(2,self.spatial_size,2)
self.linear = nn.Linear(128, 3755) self.linear = nn.Linear(128, 3755)
def forward(self, x): def forward(self, x):
x = self.inputLayer(x)
x = self.sparseModel(x) x = self.sparseModel(x)
x = x.view(-1, 128) x = x.view(-1, 128)
x = self.linear(x) x = self.linear(x)
return x return x
model = Model() model = Model()
spatial_size = model.sparseModel.input_spatial_size(torch.LongTensor([1, 1])) scale=63
print('Input spatial size:', spatial_size) dataset = get_iterators(model.spatial_size, scale)
dataset = get_iterators(spatial_size, 63, 3) print('Input spatial size:', model.spatial_size, 'Data scale:', scale)
scn.ClassificationTrainValidate( scn.ClassificationTrainValidate(
model, dataset, model, dataset,
{'n_epochs': 100, {'n_epochs': 100,
'initial_lr': 0.1, 'initial_lr': 0.1,
'lr_decay': 0.05, 'lr_decay': 0.05,
'weight_decay': 1e-4, 'weight_decay': 1e-4,
'use_gpu': torch.cuda.is_available(), 'use_cuda': torch.cuda.is_available(),
'check_point': True, }) 'check_point': False, })
...@@ -22,23 +22,27 @@ class Model(nn.Module): ...@@ -22,23 +22,27 @@ class Model(nn.Module):
).add(scn.Convolution(2, 112, 128, 3, 2, False) ).add(scn.Convolution(2, 112, 128, 3, 2, False)
).add(scn.BatchNormReLU(128) ).add(scn.BatchNormReLU(128)
).add(scn.SparseToDense(2, 128)) ).add(scn.SparseToDense(2, 128))
self.spatial_size= self.sparseModel.input_spatial_size(torch.LongTensor([1, 1]))
self.inputLayer = scn.InputLayer(2,self.spatial_size,2)
self.linear = nn.Linear(128, 3755) self.linear = nn.Linear(128, 3755)
def forward(self, x): def forward(self, x):
x = self.inputLayer(x)
x = self.sparseModel(x) x = self.sparseModel(x)
x = x.view(-1, 128) x = x.view(-1, 128)
x = self.linear(x) x = self.linear(x)
return x return x
model = Model() model = Model()
spatial_size = model.sparseModel.input_spatial_size(torch.LongTensor([1, 1])) scale=63
print('Input spatial size:', spatial_size) dataset = get_iterators(model.spatial_size, scale)
dataset = get_iterators(spatial_size, 63, 3) print('Input spatial size:', model.spatial_size, 'Data scale:', scale)
scn.ClassificationTrainValidate( scn.ClassificationTrainValidate(
model, dataset, model, dataset,
{'n_epochs': 100, {'n_epochs': 100,
'initial_lr': 0.1, 'initial_lr': 0.1,
'lr_decay': 0.05, 'lr_decay': 0.05,
'weight_decay': 1e-4, 'weight_decay': 1e-4,
'use_gpu': torch.cuda.is_available(), 'use_cuda': torch.cuda.is_available(),
'check_point': True, }) 'check_point': False, })
...@@ -10,7 +10,7 @@ import sparseconvnet as scn ...@@ -10,7 +10,7 @@ import sparseconvnet as scn
import pickle import pickle
import math import math
import random import random
import numpy import numpy as np
import os import os
if not os.path.exists('pickle/'): if not os.path.exists('pickle/'):
...@@ -24,95 +24,57 @@ if not os.path.exists('pickle/'): ...@@ -24,95 +24,57 @@ if not os.path.exists('pickle/'):
os.system('unzip OLHWDB1.1tst_pot.zip -d POT/') os.system('unzip OLHWDB1.1tst_pot.zip -d POT/')
os.system('python readPotFiles.py') os.system('python readPotFiles.py')
def interp(sample,x,y):
return torch.from_numpy(np.hstack([np.interp(sample.numpy(),x.numpy(),y[:,i].numpy())[:,None] for i in range(y.shape[1])])).float()
class Data(torch.utils.data.Dataset):
def __init__(self,file,scale=63):
print('Loading', file, 'and balancing points for scale', scale)
torch.utils.data.Dataset.__init__(self)
self.data = pickle.load(open(file, 'rb'))
for j in range(len(self.data)):
strokes=[]
features=[]
for k,stroke in enumerate(self.data[j]['input']):
if len(stroke)>1:
stroke=stroke.float()/255-0.5
stroke*=scale-1e-3
delta=stroke[1:]-stroke[:-1]
mag=(delta**2).sum(1)**0.5
l=mag.cumsum(0)
zl=torch.cat([torch.zeros(1),l])
strokes.append(interp(torch.arange(0,zl[-1]),zl,stroke))
delta/=mag[:,None]
delta=torch.Tensor(delta[[i//2 for i in range(2*len(l))]])
zl_=zl[[i//2 for i in range(1,2*len(l)+1)]]
features.append(interp(torch.arange(0,zl[-1]),zl_,delta))
self.data[j]['coords'] = torch.cat(strokes,0)
self.data[j]['features'] = torch.cat(features,0)
for i, x in enumerate(self.data):
x['idx'] = i
print('Loaded', len(self.data), 'points')
def __getitem__(self,n):
return self.data[n]
def __len__(self):
return len(self.data)
def train(spatial_size, Scale, precomputeSize): def MergeFn(spatial_size=63):
d = pickle.load(open('pickle/train.pickle', 'rb')) center = spatial_size/2
d = torchnet.dataset.ListDataset(d)
randperm = torch.randperm(len(d))
def perm(idx, size):
return randperm[idx]
def merge(tbl): def merge(tbl):
inp = scn.InputBatch(2, spatial_size) v=torch.Tensor([[1,0,0]])
center = spatial_size.float().view(1, 2) / 2 targets=[x['target'] for x in tbl]
p = torch.LongTensor(2) locations=[]
v = torch.FloatTensor([1, 0, 0]) features=[]
for char in tbl['input']: for idx,char in enumerate(tbl):
inp.add_sample() coords=char['coords']+center
for stroke in char: coords = torch.cat([coords.long(),torch.LongTensor([idx]).expand([coords.size(0),1])],1)
stroke = stroke.float() * (Scale - 0.01) / 255 - 0.5 * (Scale - 0.01) locations.append(coords)
stroke += center.expand_as(stroke) f=char['features']
############################################################### f = torch.cat([f,torch.ones([f.size(0),1])],1)
# To avoid GIL problems use a helper function: features.append(f)
scn.dim_fn( return {'input': scn.InputLayerInput(torch.cat(locations,0), torch.cat(features,0)), 'target': torch.LongTensor(targets)}
2, return merge
'drawCurve')(
inp.metadata.ffi,
inp.features,
stroke)
###############################################################
# Above is equivalent to :
# x1,x2,y1,y2,l=0,stroke[0][0],0,stroke[0][1],0
# for i in range(1,stroke.size(0)):
# x1=x2
# y1=y2
# x2=stroke[i][0]
# y2=stroke[i][1]
# l=1e-10+((x2-x1)**2+(y2-y1)**2)**0.5
# v[1]=(x2-x1)/l
# v[2]=(y2-y1)/l
# l=max(x2-x1,y2-y1,x1-x2,y1-y2,0.9)
# for j in numpy.arange(0,1,1/l):
# p[0]=math.floor(x1*j+x2*(1-j))
# p[1]=math.floor(y1*j+y2*(1-j))
# inp.set_location(p,v,False)
###############################################################
inp.precomputeMetadata(precomputeSize)
return {'input': inp, 'target': torch.LongTensor(tbl['target'])}
bd = torchnet.dataset.BatchDataset(d, 100, perm=perm, merge=merge)
tdi = scn.threadDatasetIterator(bd)
def iter():
randperm.copy_(torch.randperm(len(d)))
return tdi()
return iter
def val(spatial_size, Scale, precomputeSize):
d = pickle.load(open('pickle/test.pickle', 'rb'))
d = torchnet.dataset.ListDataset(d)
randperm = torch.randperm(len(d))
def perm(idx, size):
return randperm[idx]
def merge(tbl):
inp = scn.InputBatch(2, spatial_size)
center = spatial_size.float().view(1, 2) / 2
p = torch.LongTensor(2)
v = torch.FloatTensor([1, 0, 0])
for char in tbl['input']:
inp.add_sample()
for stroke in char:
stroke = stroke.float() * (Scale - 0.01) / 255 - 0.5 * (Scale - 0.01)
stroke += center.expand_as(stroke)
scn.dim_fn(
2,
'drawCurve')(
inp.metadata.ffi,
inp.features,
stroke)
inp.precomputeMetadata(precomputeSize)
return {'input': inp, 'target': torch.LongTensor(tbl['target'])}
bd = torchnet.dataset.BatchDataset(d, 100, perm=perm, merge=merge)
tdi = scn.threadDatasetIterator(bd)
def iter():
randperm.copy_(torch.randperm(len(d)))
return tdi()
return iter
def get_iterators(*args): def get_iterators(*args):
return {'train': train(*args), 'val': val(*args)} return {'train': torch.utils.data.DataLoader(Data('pickle/train.pickle'), collate_fn=MergeFn(), batch_size=100, shuffle=True, num_workers=10),
'val': torch.utils.data.DataLoader(Data('pickle/test.pickle'), collate_fn=MergeFn(), batch_size=100, shuffle=True, num_workers=10)}
...@@ -8,7 +8,7 @@ import torch ...@@ -8,7 +8,7 @@ import torch
import sparseconvnet as scn import sparseconvnet as scn
# Use the GPU if there is one, otherwise CPU # Use the GPU if there is one, otherwise CPU
use_gpu = torch.cuda.is_available() use_cuda = torch.cuda.is_available()
model = scn.Sequential().add( model = scn.Sequential().add(
scn.SparseVggNet(2, 1, scn.SparseVggNet(2, 1,
...@@ -22,7 +22,7 @@ model = scn.Sequential().add( ...@@ -22,7 +22,7 @@ model = scn.Sequential().add(
).add( ).add(
scn.SparseToDense(2, 32) scn.SparseToDense(2, 32)
) )
if use_gpu: if use_cuda:
model.cuda() model.cuda()
# output will be 10x10 # output will be 10x10
...@@ -67,7 +67,7 @@ input.set_locations(locations, features, 0) ...@@ -67,7 +67,7 @@ input.set_locations(locations, features, 0)
input.precomputeMetadata(3) input.precomputeMetadata(3)
model.train() model.train()
if use_gpu: if use_cuda:
input.cuda() input.cuda()
output = model.forward(input) output = model.forward(input)
......
...@@ -4,77 +4,34 @@ ...@@ -4,77 +4,34 @@
# This source code is licensed under the license found in the # This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
import os import torch, os
import torch from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension
from torch.utils.ffi import create_extension from setuptools import setup, find_packages
this_dir = os.path.dirname(os.path.realpath(__file__)) this_dir = os.path.dirname(os.path.realpath(__file__))
torch_dir = os.path.dirname(torch.__file__) torch_dir = os.path.dirname(torch.__file__)
conda_include_dir = '/'.join(torch_dir.split('/')[:-4]) + '/include'
print('Building SCN module') extra = {'cxx': ['-std=c++11', '-fopenmp'], 'nvcc': ['-std=c++11', '-Xcompiler', '-fopenmp']}
if torch.cuda.is_available():
s=('cd sparseconvnet/SCN; nvcc init.cu -c -o init.cu.o -ccbin /usr/bin/cc'
+ ' -m64 --std c++11 -Xcompiler \"-fopenmp -fPIC -O3\" '
+ '-gencode arch=compute_62,code=sm_62 '
+ '-gencode arch=compute_61,code=sm_61 '
+ '-gencode arch=compute_60,code=sm_60 '
+ '-gencode arch=compute_52,code=sm_52 '
+ '-gencode arch=compute_50,code=sm_50 '
+ '-gencode arch=compute_30,code=sm_30 '
+ '-DNVCC '
+ '-I/usr/local/cuda/include '
+ '-I' + '/'.join(torch_dir.split('/')[:-4]) + '/include '
+ '-I' + torch_dir + '/lib/include '
+ '-I' + torch_dir + '/lib/include/TH '
+ '-I' + torch_dir + '/lib/include/THC '
+ '-I.')
r = os.system(s)
assert r == 0
ffi = create_extension(
'sparseconvnet.SCN',
headers=[
'sparseconvnet/SCN/header_cpu.h',
'sparseconvnet/SCN/header_gpu.h'],
sources=[],
include_dirs=[os.path.expandvars('$CUDA_HOME') + '/include'],
extra_objects=[
this_dir +
'/sparseconvnet/SCN/init.cu.o'],
relative_to=__file__,
extra_compile_args=["-std=c99"],
with_cuda=True)
else:
r = os.system(
'cd sparseconvnet/SCN; g++ -fopenmp -std=c++11 -O3 -fPIC -c init.cpp -o init.cpp.o '
+ '-I' + '/'.join(torch_dir.split('/')[:-4]) + '/include '
+ '-I' + torch_dir + '/lib/include '
+ '-I' + torch_dir + '/lib/include/TH '
+ '-I.')
assert r == 0
ffi = create_extension(
'sparseconvnet.SCN',
headers=['sparseconvnet/SCN/header_cpu.h'],
sources=[],
extra_objects=[
this_dir +
'/sparseconvnet/SCN/init.cpp.o'],
relative_to=__file__,
extra_compile_args=["-std=c99"],
with_cuda=False)
ffi.build()
from setuptools import setup, find_packages
setup( setup(
name='sparseconvnet', name='sparseconvnet',
version='0.1.1', version='0.2',
description='Submanifold (Spatially) Sparse Convolutional Networks https://arxiv.org/abs/1706.01307', description='Submanifold (Spatially) Sparse Convolutional Networks https://arxiv.org/abs/1706.01307',
author='Facebook AI Research', author='Facebook AI Research',
author_email='benjamingraham@fb.com', author_email='benjamingraham@fb.com',
url='https://github.com/facebookresearch/SparseConvNet', url='https://github.com/facebookresearch/SparseConvNet',
package_data={ packages=['sparseconvnet','sparseconvnet.SCN'],
'sparseconvnet': ['SCN/_SCN.so'], ext_modules=[
}, CUDAExtension('sparseconvnet_SCN',
packages=find_packages(), ['sparseconvnet/SCN/pybind_cuda.cpp', 'sparseconvnet/SCN/instantiate_cpu.cpp', 'sparseconvnet/SCN/instantiate_cuda.cu'],
# Since the package includes a shared object, this is not zip-safe. include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'],
extra_compile_args=extra)
if torch.cuda.is_available() else
CppExtension('sparseconvnet_SCN',
['sparseconvnet/SCN/pybind_cpu.cpp', 'sparseconvnet/SCN/instantiate_cpu.cpp'],
include_dirs=[conda_include_dir, this_dir+'/sparseconvnet/SCN/'],
extra_compile_args=extra['cxx'])],
cmdclass={'build_ext': BuildExtension},
zip_safe=False, zip_safe=False,
) )
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "ActivePooling.h"
template <typename T, Int Dimension>
void cpu_ActivePooling_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
output_features.resize_({batchSize, nPlanes});
output_features.zero_();
ActivePooling_ForwardPass<T>(input_features.data<T>(),
output_features.data<T>(), batchSize, maxActive,
nPlanes, _rules, average);
}
template <typename T, Int Dimension>
void cpu_ActivePooling_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*float*/ at::Tensor input_features,
/*float*/ at::Tensor d_input_features,
/*float*/ at::Tensor d_output_features, bool average) {
Int nPlanes = input_features.size(1);
auto _rules = m.getActivePoolingRuleBook(inputSize);
Int batchSize = _rules[1][0];
Int maxActive = _rules[1][1];
d_input_features.resize_as_(input_features);
d_input_features.zero_();
ActivePooling_BackwardPass<T>(d_input_features.data<T>(),
d_output_features.data<T>(), batchSize,
maxActive, nPlanes, _rules, average);
}
...@@ -11,32 +11,32 @@ ...@@ -11,32 +11,32 @@
template <typename T> template <typename T>
void ActivePooling_ForwardPass(T *input_features, T *output_features, void ActivePooling_ForwardPass(T *input_features, T *output_features,
uInt batchSize, uInt maxActive, uInt nPlanes, Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) { RuleBook &rules, bool average) {
for (uInt outSite = 0; outSite < batchSize; outSite++) { for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &output_features[outSite * nPlanes]; T *out = &output_features[outSite * nPlanes];
uInt *r = &rules[0][outSite * (maxActive + 1)]; Int *r = &rules[0][outSite * (maxActive + 1)];
uInt nActive = *r++; Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f; T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) { while (nActive-- > 0) {
T *inp = &input_features[(*r++) * nPlanes]; T *inp = &input_features[(*r++) * nPlanes];
for (uInt plane = 0; plane < nPlanes; plane++) for (Int plane = 0; plane < nPlanes; plane++)
out[plane] += inp[plane] * multiplier; out[plane] += inp[plane] * multiplier;
} }
} }
} }
template <typename T> template <typename T>
void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features, void ActivePooling_BackwardPass(T *d_input_features, T *d_output_features,
uInt batchSize, uInt maxActive, uInt nPlanes, Int batchSize, Int maxActive, Int nPlanes,
RuleBook &rules, bool average) { RuleBook &rules, bool average) {
for (uInt outSite = 0; outSite < batchSize; outSite++) { for (Int outSite = 0; outSite < batchSize; outSite++) {
T *out = &d_output_features[outSite * nPlanes]; T *out = &d_output_features[outSite * nPlanes];
uInt *r = &rules[0][outSite * (maxActive + 1)]; Int *r = &rules[0][outSite * (maxActive + 1)];
uInt nActive = *r++; Int nActive = *r++;
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f; T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
while (nActive-- > 0) { while (nActive-- > 0) {
T *inp = &d_input_features[(*r++) * nPlanes]; T *inp = &d_input_features[(*r++) * nPlanes];
for (uInt plane = 0; plane < nPlanes; plane++) for (Int plane = 0; plane < nPlanes; plane++)
inp[plane] = out[plane] * multiplier; inp[plane] = out[plane] * multiplier;
} }
} }
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AffineReluTrivialConvolution.h"
// CPU forward pass for the fused affine + ReLU + 1x1 ("trivial") convolution.
// Resizes `output_features` to nActive x convWeight.size(1) and returns
// nActive * nIn * nOut — presumably a multiply count for FLOP accounting;
// confirm against callers.
template <typename T>
double cpu_AffineReluTrivialConvolution_updateOutput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor affineWeight,
    /*float*/ at::Tensor affineBias, /*float*/ at::Tensor convWeight) {
  Int nIn = convWeight.size(0);  // input planes
  Int nOut = convWeight.size(1); // output planes
  Int nActive = input_features.size(0);
  output_features.resize_({nActive, nOut});
  AffineReluTrivialConvolution_ForwardPass(
      input_features.data<T>(), nIn, input_features.stride(0),
      output_features.data<T>(), nOut, output_features.stride(0),
      affineWeight.data<T>(), affineBias.data<T>(), convWeight.data<T>(),
      nActive);
  return input_features.size(0) * input_features.size(1) *
         output_features.size(1);
}
// CPU backward pass for the fused affine + ReLU + 1x1 convolution.
// Resizes `d_input_features` to match the input; the kernel fills the input
// gradient and accumulates the affine/convolution weight gradients.
// `additiveGrad` is forwarded to the kernel — presumably it selects
// accumulate-vs-overwrite for the input gradient; confirm in the header.
template <typename T>
void cpu_AffineReluTrivialConvolution_backward(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor affineWeight,
    /*float*/ at::Tensor d_affineWeight, /*float*/ at::Tensor affineBias,
    /*float*/ at::Tensor d_affineBias,
    /*float*/ at::Tensor convWeight, /*float*/ at::Tensor d_convWeight,
    bool additiveGrad) {
  Int nIn = convWeight.size(0);  // input planes
  Int nOut = convWeight.size(1); // output planes
  d_input_features.resize_as_(input_features);
  AffineReluTrivialConvolution_BackwardPass(
      input_features.data<T>(), d_input_features.data<T>(), nIn,
      input_features.stride(0), d_output_features.data<T>(), nOut,
      d_output_features.stride(0), affineWeight.data<T>(),
      d_affineWeight.data<T>(), affineBias.data<T>(), d_affineBias.data<T>(),
      convWeight.data<T>(), d_convWeight.data<T>(), input_features.size(0),
      additiveGrad);
}
...@@ -6,20 +6,19 @@ ...@@ -6,20 +6,19 @@
#ifndef CPU_AffineReluTrivialConvolution_H #ifndef CPU_AffineReluTrivialConvolution_H
#define CPU_AffineReluTrivialConvolution_H #define CPU_AffineReluTrivialConvolution_H
#include "../SparseConvNet.h"
#include <cstring> #include <cstring>
// buffer must have size >= nHot * (nIn+nOut)
template <typename T> template <typename T>
void AffineReluTrivialConvolution_ForwardPass( void AffineReluTrivialConvolution_ForwardPass(
T *input_features, uInt input_nPlanes, uInt input_stride, T *input_features, Int input_nPlanes, Int input_stride, T *output_features,
T *output_features, uInt output_nPlanes, uInt output_stride, Int output_nPlanes, Int output_stride, T *affineWeight, T *affineBias,
T *affineWeight, T *affineBias, T *convWeight, uInt nActive) { T *convWeight, Int nActive) {
for (uInt row = 0; row < nActive; row++) { for (Int row = 0; row < nActive; row++) {
for (uInt column = 0; column < output_nPlanes; column++) { for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0; T sum = 0;
for (uInt j = 0; j < input_nPlanes; j++) { for (Int j = 0; j < input_nPlanes; j++) {
T i = input_features[row * input_stride + j] * affineWeight[j] + T i = input_features[row * input_stride + j] * affineWeight[j] +
affineBias[j]; affineBias[j];
i = (i > 0) ? i : 0; i = (i > 0) ? i : 0;
...@@ -32,16 +31,15 @@ void AffineReluTrivialConvolution_ForwardPass( ...@@ -32,16 +31,15 @@ void AffineReluTrivialConvolution_ForwardPass(
template <typename T> template <typename T>
void AffineReluTrivialConvolution_BackwardPass( void AffineReluTrivialConvolution_BackwardPass(
T *input_features, T *d_input_features, uInt input_nPlanes, T *input_features, T *d_input_features, Int input_nPlanes, Int input_stride,
uInt input_stride, T *d_output_features, uInt output_nPlanes, T *d_output_features, Int output_nPlanes, Int output_stride,
uInt output_stride, T *affineWeight, T *dAffineWeight, T *affineBias, T *affineWeight, T *dAffineWeight, T *affineBias, T *dAffineBias,
T *dAffineBias, T *convWeight, T *dConvWeight, uInt nActive, T *convWeight, T *dConvWeight, Int nActive, bool additiveGrad) {
bool additiveGrad) {
for (uInt row = 0; row < input_nPlanes; row++) { for (Int row = 0; row < input_nPlanes; row++) {
for (uInt column = 0; column < output_nPlanes; column++) { for (Int column = 0; column < output_nPlanes; column++) {
T sum = 0; T sum = 0;
for (uInt j = 0; j < nActive; j++) { for (Int j = 0; j < nActive; j++) {
T i = input_features[j * input_stride + row] * affineWeight[row] + T i = input_features[j * input_stride + row] * affineWeight[row] +
affineBias[row]; affineBias[row];
i = (i > 0) ? i : 0; i = (i > 0) ? i : 0;
...@@ -50,10 +48,10 @@ void AffineReluTrivialConvolution_BackwardPass( ...@@ -50,10 +48,10 @@ void AffineReluTrivialConvolution_BackwardPass(
dConvWeight[row * output_nPlanes + column] += sum; dConvWeight[row * output_nPlanes + column] += sum;
} }
} }
for (uInt row = 0; row < nActive; row++) { for (Int row = 0; row < nActive; row++) {
for (uInt column = 0; column < input_nPlanes; column++) { for (Int column = 0; column < input_nPlanes; column++) {
T sum = 0; T sum = 0;
for (uInt j = 0; j < output_nPlanes; j++) { for (Int j = 0; j < output_nPlanes; j++) {
sum += d_output_features[row * output_stride + j] * sum += d_output_features[row * output_stride + j] *
convWeight[column * output_nPlanes + j]; convWeight[column * output_nPlanes + j];
} }
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "AveragePooling.h"
// CPU forward pass for AveragePooling. Builds (or fetches) the pooling
// rulebook, resizes `output_features` to nActive x nPlanes, and runs the
// forward kernel once per rulebook entry. The first `nFeaturesToDrop`
// feature planes are skipped by offsetting the input pointer.
template <typename T, Int Dimension>
void cpu_AveragePooling_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize,
    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
    /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor output_features, long nFeaturesToDrop) {
  auto ruleBook =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, nPlanes});
  output_features.zero_();
  T *inPtr = input_features.data<T>() + nFeaturesToDrop;
  T *outPtr = output_features.data<T>();
  for (auto &rule : ruleBook) {
    // Each rule stores (input, output) index pairs, hence size() / 2.
    Int nHot = rule.size() / 2;
    AveragePooling_ForwardPass<T>(inPtr, outPtr, nPlanes,
                                  input_features.stride(0),
                                  output_features.stride(0), &rule[0], nHot,
                                  ruleBook.size());
  }
}
// CPU backward pass for AveragePooling. Resizes and zeroes
// `d_input_features`, then runs the backward kernel once per rulebook
// entry, distributing output gradients back through the pooling rules.
// The first `nFeaturesToDrop` planes are skipped via pointer offset.
template <typename T, Int Dimension>
void cpu_AveragePooling_updateGradInput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor poolSize,
    /*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
    /*float*/ at::Tensor input_features,
    /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
  auto ruleBook =
      m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
  Int nPlanes = input_features.size(1) - nFeaturesToDrop;
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  T *dInPtr = d_input_features.data<T>() + nFeaturesToDrop;
  T *dOutPtr = d_output_features.data<T>();
  for (auto &rule : ruleBook) {
    // Each rule stores (input, output) index pairs, hence size() / 2.
    Int nHot = rule.size() / 2;
    AveragePooling_BackwardPass<T>(dInPtr, dOutPtr, nPlanes,
                                   input_features.stride(0),
                                   d_output_features.stride(0), &rule[0], nHot,
                                   ruleBook.size());
  }
}
...@@ -6,29 +6,29 @@ ...@@ -6,29 +6,29 @@
#ifndef CPU_AVERAGEPOOLING_H #ifndef CPU_AVERAGEPOOLING_H
#define CPU_AVERAGEPOOLING_H #define CPU_AVERAGEPOOLING_H
#include "../SparseConvNet.h"
template <typename T> template <typename T>
void AveragePooling_ForwardPass(T *input_features, T *output_features, void AveragePooling_ForwardPass(T *input_features, T *output_features,
uInt nPlanes, uInt input_stride, Int nPlanes, Int input_stride,
uInt output_stride, uInt *rules, uInt nHot, Int output_stride, Int *rules, Int nHot,
uInt filterVolume) { Int filterVolume) {
for (uInt outSite = 0; outSite < nHot; outSite++) { for (Int outSite = 0; outSite < nHot; outSite++) {
uInt i = rules[2 * outSite] * input_stride; Int i = rules[2 * outSite] * input_stride;
uInt o = rules[2 * outSite + 1] * output_stride; Int o = rules[2 * outSite + 1] * output_stride;
for (uInt plane = 0; plane < nPlanes; plane++) for (Int plane = 0; plane < nPlanes; plane++)
output_features[o + plane] += input_features[i + plane] / filterVolume; output_features[o + plane] += input_features[i + plane] / filterVolume;
} }
} }
template <typename T> template <typename T>
void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features, void AveragePooling_BackwardPass(T *d_input_features, T *d_output_features,
uInt nPlanes, uInt input_stride, Int nPlanes, Int input_stride,
uInt output_stride, uInt *rules, uInt nHot, Int output_stride, Int *rules, Int nHot,
uInt filterVolume) { Int filterVolume) {
for (uInt outSite = 0; outSite < nHot; outSite++) { for (Int outSite = 0; outSite < nHot; outSite++) {
uInt i = rules[2 * outSite] * input_stride; Int i = rules[2 * outSite] * input_stride;
uInt o = rules[2 * outSite + 1] * output_stride; Int o = rules[2 * outSite + 1] * output_stride;
for (uInt plane = 0; plane < nPlanes; plane++) for (Int plane = 0; plane < nPlanes; plane++)
d_input_features[i + plane] += d_input_features[i + plane] +=
d_output_features[o + plane] / filterVolume; d_output_features[o + plane] / filterVolume;
} }
......
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h"
// CPU batch-normalization forward. Resizes `output_features` to match the
// input, then runs the kernel — but only for 2-d (nActive x nPlanes)
// inputs; other ranks are silently left as zero-computation no-ops.
// `leakiness` is forwarded to the kernel — presumably a fused leaky-ReLU
// slope; confirm in BatchNormalization.h.
template <typename T>
void cpu_BatchNormalization_updateOutput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor saveMean,
    /*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
    /*float*/ at::Tensor runningVar,
    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum,
    bool train, T leakiness) {
  output_features.resize_as_(input_features);
  if (input_features.ndimension() != 2)
    return; // non-2d inputs: resize only, no computation
  BatchNormalization_ForwardPass<T>(
      input_features.data<T>(), output_features.data<T>(),
      input_features.size(1), input_features.stride(0),
      output_features.stride(0), input_features.size(0), saveMean.data<T>(),
      saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
      OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,
      momentum, train, leakiness);
}
// In-tensor variant of the batch-normalization forward pass: identical to
// cpu_BatchNormalization_updateOutput except that `output_features` is NOT
// resized — the caller is expected to provide a correctly-sized tensor.
// Only 2-d (nActive x nPlanes) inputs are processed.
template <typename T>
void cpu_BatchNormalizationInTensor_updateOutput(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor saveMean,
    /*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
    /*float*/ at::Tensor runningVar,
    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias, T eps, T momentum,
    bool train, T leakiness) {
  if (input_features.ndimension() != 2)
    return; // non-2d inputs: nothing to do
  BatchNormalization_ForwardPass<T>(
      input_features.data<T>(), output_features.data<T>(),
      input_features.size(1), input_features.stride(0),
      output_features.stride(0), input_features.size(0), saveMean.data<T>(),
      saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
      OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,
      momentum, train, leakiness);
}
// CPU batch-normalization backward. Resizes `d_input_features` to match the
// input, then runs the backward kernel for 2-d (nActive x nPlanes) inputs;
// other ranks only get the resize. Weight/bias gradients are passed through
// OptionalTensorData, so they may be absent (affine-free batch norm).
template <typename T>
void cpu_BatchNormalization_backward(
    /*float*/ at::Tensor input_features, /*float*/ at::Tensor d_input_features,
    /*float*/ at::Tensor output_features,
    /*float*/ at::Tensor d_output_features, /*float*/ at::Tensor saveMean,
    /*float*/ at::Tensor saveInvStd, /*float*/ at::Tensor runningMean,
    /*float*/ at::Tensor runningVar,
    /*float*/ at::Tensor weight, /*float*/ at::Tensor bias,
    /*float*/ at::Tensor d_weight, /*float*/ at::Tensor d_bias, T leakiness) {
  d_input_features.resize_as_(input_features);
  if (input_features.ndimension() != 2)
    return; // non-2d inputs: resize only, no computation
  BatchNormalization_BackwardPass<T>(
      input_features.data<T>(), d_input_features.data<T>(),
      output_features.data<T>(), d_output_features.data<T>(),
      input_features.size(1), input_features.stride(0),
      output_features.stride(0), input_features.size(0), saveMean.data<T>(),
      saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),
      OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),
      OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias),
      leakiness);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment