Commit 2d21747a authored by Zhang

v0.4.2

parent 7e19143c
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
import torch.utils.data as data
from PIL import Image, ImageOps
import torchvision.transforms as transform
from .dataset import ToLabel
class FolderLoader(data.Dataset):
def __init__(self, root, transform=None):
self.root = root
self.transform = transform
self.images = get_folder_images(root)
if len(self.images) == 0:
raise(RuntimeError("Found 0 images in subfolders of: \
" + self.root + "\n"))
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
def __len__(self):
return len(self.images)
def get_folder_images(img_folder):
img_paths = []
for filename in os.listdir(img_folder):
if filename.endswith(".jpg"):
imgpath = os.path.join(img_folder, filename)
img_paths.append(imgpath)
return img_paths
class Dataloder():
def __init__(self, args):
# the data augmentation is implemented as part of the dataloader
assert(args.test)
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize(args.mean, args.std)])
args.test_batch_size = 1
assert(args.test_folder is not None)
print('loading the data from: {}'.format(args.test_folder))
testset = FolderLoader(args.test_folder, input_transform)
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
self.trainloader = None
self.testloader = data.DataLoader(testset,
batch_size=args.test_batch_size,
shuffle=False, **kwargs)
def getloader(self):
return self.trainloader, self.testloader
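A minimal usage sketch for the folder loader above (not part of this commit; the `args` values below, including the folder path and the ImageNet-style mean/std, are illustrative assumptions):
import argparse
# Hypothetical arguments; only the attributes Dataloder actually reads are set.
args = argparse.Namespace(test=True, test_folder='./demo_images', workers=2, cuda=False,
                          mean=[.485, .456, .406], std=[.229, .224, .225])
loader = Dataloder(args)
_, test_loader = loader.getloader()
for img, name in test_loader:
    print(name[0], tuple(img.shape))  # one (1, 3, H, W) tensor and its file name per batch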
# created by: Sean Liu
# Amazon Lab 126
from __future__ import print_function
import errno
import hashlib
import os
import sys
import tarfile
import numpy as np
import random
import math
import torch.utils.data as data
import PIL
from PIL import Image, ImageOps
from six.moves import urllib
class Segmentation_HPW18(data.Dataset):
CLASSES = [
'background', 'hat', 'hair', 'sunglasses', 'upper-clothes',
'skirt', 'pants', 'dress', 'belt', 'left-shoe', 'right-shoe',
'face', 'left-leg', 'right-leg', 'left-arm', 'right-arm', 'bag',
'scarf'
]
URL = "/cvdata1/lliuqian/humanParsingDataset"
FILE = "hpw18.tar.gz"
MD5 = ''
BASE_DIR = ''
def __init__(self,
root,
train=True,
transform=None,
target_transform=None,
download=False):
self.root = root
_hpw18_root = os.path.join(self.root, self.BASE_DIR)
_mask_dir = os.path.join(_hpw18_root, 'SegmentationClassAug_256x384')
_image_dir = os.path.join(_hpw18_root, 'JPEGImages_256x384')
self.transform = transform
self.target_transform = target_transform
self.train = train
if download:
self._download()
# train/val/test splits are pre-cut
_splits_dir = _hpw18_root
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_train.txt')
if not self.train:
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_val.txt')
print("reading from ", _split_f)
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in lines:
s = line.split()
_image = s[0] # absolute path of the image
_mask = s[1] # absolute path of the mask
assert os.path.isfile(_image)
assert os.path.isfile(_mask)
self.images.append(_image)
self.masks.append(_mask)
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
_img = Image.open(self.images[index]).convert('RGB')
_timg = Image.open(self.masks[index])
_target = np.array(_timg, dtype=np.uint8)
_target = Image.fromarray(_target)
# synchronized transform
if self.train:
_img, _target = self._sync_transform( _img, _target)
# general resize, normalize and toTensor
if self.transform is not None:
_img = self.transform(_img)
if self.target_transform is not None:
_target = self.target_transform(_target)
return _img, _target
def __len__(self):
return len(self.images)
def _sync_transform(self, img, mask):
# random rotate -10~10
deg = random.uniform(-10,10)
img = img.rotate(deg)
mask = mask.rotate(deg, PIL.Image.NEAREST)
return img, mask
if __name__ == '__main__':
hpw18 = Segmentation_HPW18('/cvdata1/lliuqian/', train=True)
print(hpw18[0])
print (len(hpw18))
import os
import random
import scipy.io
import numpy as np
from PIL import Image, ImageOps, ImageFilter
from .base import BaseDataset
class VOCAugSegmentation(BaseDataset):
voc = [
'background', 'airplane', 'bicycle', 'bird', 'boat', 'bottle',
'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorcycle', 'person', 'potted-plant', 'sheep', 'sofa', 'train',
'tv'
]
NUM_CLASS = 21
TRAIN_BASE_DIR = 'VOCaug/dataset/'
def __init__(self, root, split='train', mode=None, transform=None,
target_transform=None):
super(VOCAugSegmentation, self).__init__(root, split, mode, transform, target_transform)
# train/val/test splits are pre-cut
_voc_root = os.path.join(root, self.TRAIN_BASE_DIR)
_mask_dir = os.path.join(_voc_root, 'cls')
_image_dir = os.path.join(_voc_root, 'img')
if self.mode == 'train':
_split_f = os.path.join(_voc_root, 'trainval.txt')
elif self.mode == 'val':
_split_f = os.path.join(_voc_root, 'val.txt')
else:
raise RuntimeError('Unknown dataset split.')
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in lines:
_image = os.path.join(_image_dir, line.rstrip('\n')+".jpg")
assert os.path.isfile(_image)
self.images.append(_image)
if self.mode != 'test':
_mask = os.path.join(_mask_dir, line.rstrip('\n')+".mat")
assert os.path.isfile(_mask)
self.masks.append(_mask)
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
_img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
_img = self.transform(_img)
return _img, os.path.basename(self.images[index])
_target = self._load_mat(self.masks[index])
# synchronized transform
if self.mode == 'train':
_img, _target = self._sync_transform( _img, _target)
elif self.mode == 'val':
_img, _target = self._val_sync_transform( _img, _target)
# general resize, normalize and toTensor
if self.transform is not None:
_img = self.transform(_img)
if self.target_transform is not None:
_target = self.target_transform(_target)
return _img, _target
def _load_mat(self, filename):
mat = scipy.io.loadmat(filename, mat_dtype=True, squeeze_me=True,
struct_as_record=False)
mask = mat['GTcls'].Segmentation
return Image.fromarray(mask)
def __len__(self):
return len(self.images)
import os
import random
import numpy as np
from PIL import Image, ImageOps, ImageFilter
from tqdm import tqdm
import torch
from .base import BaseDataset
class VOCSegmentation(BaseDataset):
CLASSES = [
'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'potted-plant', 'sheep', 'sofa', 'train',
'tv/monitor', 'ambigious'
]
NUM_CLASS = 21
BASE_DIR = 'VOCdevkit/VOC2012'
def __init__(self, root, split='train', mode=None, transform=None,
target_transform=None):
super(VOCSegmentation, self).__init__(root, split, mode, transform, target_transform)
_voc_root = os.path.join(self.root, self.BASE_DIR)
_mask_dir = os.path.join(_voc_root, 'SegmentationClass')
_image_dir = os.path.join(_voc_root, 'JPEGImages')
# train/val/test splits are pre-cut
_splits_dir = os.path.join(_voc_root, 'ImageSets/Segmentation')
if self.mode == 'train':
_split_f = os.path.join(_splits_dir, 'trainval.txt')
elif self.mode == 'val':
_split_f = os.path.join(_splits_dir, 'val.txt')
elif self.mode == 'test':
_split_f = os.path.join(_splits_dir, 'test.txt')
else:
raise RuntimeError('Unknown dataset split.')
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in tqdm(lines):
_image = os.path.join(_image_dir, line.rstrip('\n')+".jpg")
assert os.path.isfile(_image)
self.images.append(_image)
if self.mode != 'test':
_mask = os.path.join(_mask_dir, line.rstrip('\n')+".png")
assert os.path.isfile(_mask)
self.masks.append(_mask)
if self.mode != 'test':
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
target = Image.open(self.masks[index])
# synchronized transform
if self.mode == 'train':
img, target = self._sync_transform( img, target)
elif self.mode == 'val':
img, target = self._val_sync_transform( img, target)
else:
assert self.mode == 'testval'
target = self._mask_transform(target)
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
target = self.target_transform(target)
return img, target
def _mask_transform(self, mask):
target = np.array(mask).astype('int32')
target[target == 255] = -1
return torch.from_numpy(target).long()
def __len__(self):
return len(self.images)
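As a quick illustration of `_mask_transform` above (a sketch, not part of the commit): the VOC ignore value 255 is remapped to -1 so it can be excluded later on.
import numpy as np
import torch
mask = np.array([[0, 15, 255]], dtype=np.uint8)   # hypothetical 1x3 label patch
target = mask.astype('int32')
target[target == 255] = -1                         # same remapping as _mask_transform
print(torch.from_numpy(target).long())             # tensor([[ 0, 15, -1]])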
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
from PIL import Image, ImageOps, ImageFilter
import os
import os.path
import math
import random
import numpy as np
import torch
from .base import BaseDataset
class ContextSegmentation(BaseDataset):
BASE_DIR = 'VOCdevkit/VOC2010'
NUM_CLASS = 59
def __init__(self, root=os.path.expanduser('~/.encoding/data'), split='train',
mode=None, transform=None, target_transform=None):
super(ContextSegmentation, self).__init__(
root, split, mode, transform, target_transform)
from detail import Detail
#from detail import mask
root = os.path.join(root, self.BASE_DIR)
annFile = os.path.join(root, 'trainval_merged.json')
imgDir = os.path.join(root, 'JPEGImages')
# training mode
if split == 'train':
phase = 'train'
elif split == 'val':
phase = 'val'
elif split == 'test':
phase = 'val'
#phase = 'test'
print('annFile', annFile)
print('imgDir', imgDir)
self.detail = Detail(annFile, imgDir, phase)
self.transform = transform
self.target_transform = target_transform
self.ids = self.detail.getImgs()
self._mapping = np.sort(np.array([
0, 2, 259, 260, 415, 324, 9, 258, 144, 18, 19, 22,
23, 397, 25, 284, 158, 159, 416, 33, 162, 420, 454, 295, 296,
427, 44, 45, 46, 308, 59, 440, 445, 31, 232, 65, 354, 424,
68, 326, 72, 458, 34, 207, 80, 355, 85, 347, 220, 349, 360,
98, 187, 104, 105, 366, 189, 368, 113, 115]))
self._key = np.array(range(len(self._mapping))).astype('uint8')
def _class_to_index(self, mask):
# assert the values
values = np.unique(mask)
#assert(values.size > 1)
for i in range(len(values)):
assert(values[i] in self._mapping)
index = np.digitize(mask.ravel(), self._mapping, right=True)
return self._key[index].reshape(mask.shape)
def __getitem__(self, index):
detail = self.detail
img_id = self.ids[index]
path = img_id['file_name']
iid = img_id['image_id']
img = Image.open(os.path.join(detail.img_folder, path)).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(path)
# convert mask to 60 categories
mask = Image.fromarray(self._class_to_index(
detail.getMask(img_id)))
# synchronized transform
if self.mode == 'train':
img, mask = self._sync_transform(img, mask)
elif self.mode == 'val':
img, mask = self._val_sync_transform(img, mask)
else:
assert self.mode == 'testval'
mask = self._mask_transform(mask)
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
mask = self.target_transform(mask)
return img, mask
def _mask_transform(self, mask):
target = np.array(mask).astype('int32') - 1
return torch.from_numpy(target).long()
def __len__(self):
return len(self.ids)
@property
def pred_offset(self):
return 1
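A small sketch (not part of the commit) of the `_class_to_index` remapping above: raw category ids from the Detail annotations are located in the sorted `_mapping` array and replaced by contiguous indices starting at 0; the shortened mapping below is a stand-in for illustration.
import numpy as np
mapping = np.sort(np.array([0, 2, 9, 18, 19, 22]))   # stand-in for self._mapping
key = np.arange(len(mapping)).astype('uint8')        # contiguous class indices
raw = np.array([[0, 9, 22], [2, 18, 19]])            # hypothetical raw mask values
index = np.digitize(raw.ravel(), mapping, right=True)
print(key[index].reshape(raw.shape))                 # [[0 2 5], [1 3 4]]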
"""Dilated ResNet and DenseNet""" """Dilated ResNet and DenseNet"""
from .resnet import * from .resnet import *
from .densenet import *
"""Dilated DenseNet"""
from collections import OrderedDict
import torch
import torch.utils.model_zoo as model_zoo
from .. import nn
from .. import functions as F
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
def densenet121(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet121']))
return model
def densenet169(pretrained=False, **kwargs):
r"""Densenet-169 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet169']))
return model
def densenet201(pretrained=False, **kwargs):
r"""Densenet-201 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet201']))
return model
def densenet161(pretrained=False, **kwargs):
r"""Densenet-161 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet161']))
return model
class _DenseLayer(nn.Sequential):
# pylint: disable=expression-not-assigned
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
super(_DenseLayer, self).__init__()
self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu.1', nn.ReLU(inplace=True)),
self.add_module('conv.1', nn.Conv2d(
num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu.2', nn.ReLU(inplace=True)),
self.add_module('conv.2', nn.Conv2d(
bn_size * growth_rate, growth_rate, kernel_size=3, stride=1,
padding=dilation, dilation=dilation, bias=False)),
self.drop_rate = drop_rate
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, dilation=1):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate, dilation=dilation)
self.add_module('denselayer%d' % (i + 1), layer)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features, stride, dilation=1):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride,
dilation=dilation))
class DenseNet(nn.Module):
r"""Dilated DenseNet.
For correct dilation of the DenseNet transition layers, we implement :class:`encoding.nn.DilatedAvgPool2d`.
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for number of bottleneck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
Reference:
Huang, Gao, et al. "Densely Connected Convolutional Networks" *CVPR 2017*
"""
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
super(DenseNet, self).__init__()
# First convolution
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
strides = [1, 2, 1, 1]
dilations = [1, 1, 2, 4]
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate,
dilation=dilations[i])
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, stride=strides[i+1], dilation=dilations[i])
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', nn.BatchNorm2d(num_features))
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
def forward(self, x):
features = self.features(x)
out = F.relu(features, inplace=True)
# out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
# out = self.classifier(out)
return out
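For reference, the channel bookkeeping of the constructor loop above for the densenet121 configuration (a sketch, not part of the commit); with strides = [1, 2, 1, 1] only transition1 halves the spatial size, so after the stride-4 stem the final feature map has overall stride 8.
num_features = 64                                  # num_init_features for densenet121
for i, num_layers in enumerate((6, 12, 24, 16)):   # block_config
    num_features += num_layers * 32                # growth_rate = 32
    if i != 3:
        num_features //= 2                         # transition layers halve the channels
print(num_features)                                # 1024 channels in the stride-8 output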
@@ -26,15 +26,15 @@ class BasicBlock(nn.Module):
     """ResNet BasicBlock
     """
     expansion = 1
-    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
+    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1,
                  norm_layer=None):
         super(BasicBlock, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                                padding=dilation, dilation=dilation, bias=False)
         self.bn1 = norm_layer(planes)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
-                               padding=first_dilation, dilation=first_dilation, bias=False)
+                               padding=previous_dilation, dilation=previous_dilation, bias=False)
         self.bn2 = norm_layer(planes)
         self.downsample = downsample
         self.stride = stride
@@ -64,7 +64,7 @@ class Bottleneck(nn.Module):
     # pylint: disable=unused-argument
     expansion = 4
     def __init__(self, inplanes, planes, stride=1, dilation=1,
-                 downsample=None, first_dilation=1, norm_layer=None):
+                 downsample=None, previous_dilation=1, norm_layer=None):
         super(Bottleneck, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
         self.bn1 = norm_layer(planes)
@@ -75,7 +75,7 @@ class Bottleneck(nn.Module):
         self.conv3 = nn.Conv2d(
             planes, planes * 4, kernel_size=1, bias=False)
         self.bn3 = norm_layer(planes * 4)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.downsample = downsample
         self.dilation = dilation
         self.stride = stride
@@ -113,6 +113,21 @@ class Bottleneck(nn.Module):
 class ResNet(nn.Module):
     """Dilated Pre-trained ResNet Model, which produces the stride-8 feature maps at conv5.
+    Parameters
+    ----------
+    block : Block
+        Class for the residual block. Options are BasicBlockV1, BottleneckV1.
+    layers : list of int
+        Numbers of layers in each block
+    classes : int, default 1000
+        Number of classification classes.
+    dilated : bool, default False
+        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
+        typically used in Semantic Segmentation.
+    norm_layer : object
+        Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`;
+        for Synchronized Cross-GPU BatchNormalization).
     Reference:
         - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
@@ -120,18 +135,26 @@ class ResNet(nn.Module):
         - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
     """
     # pylint: disable=unused-variable
-    def __init__(self, block, layers, num_classes=1000, norm_layer=None):
+    def __init__(self, block, layers, num_classes=1000, dilated=True, norm_layer=nn.BatchNorm2d):
         self.inplanes = 64
         super(ResNet, self).__init__()
         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=False)
         self.bn1 = norm_layer(64)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
         self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
         self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
-        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
-        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
+        if dilated:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
+                                           dilation=2, norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
+                                           dilation=4, norm_layer=norm_layer)
+        else:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
+                                           norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
+                                           norm_layer=norm_layer)
         self.avgpool = nn.AvgPool2d(7)
         self.fc = nn.Linear(512 * block.expansion, num_classes)
@@ -155,16 +178,16 @@ class ResNet(nn.Module):
         layers = []
         if dilation == 1 or dilation == 2:
             layers.append(block(self.inplanes, planes, stride, dilation=1,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         elif dilation == 4:
             layers.append(block(self.inplanes, planes, stride, dilation=2,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         else:
             raise RuntimeError("=> unknown dilation size: {}".format(dilation))
         self.inplanes = planes * block.expansion
         for i in range(1, blocks):
-            layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
+            layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation,
                                 norm_layer=norm_layer))
         return nn.Sequential(*layers)
...
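The practical effect of the new `dilated` flag (an illustrative sketch, not part of the diff): with `dilated=True`, layer3/layer4 keep stride 1 and use dilation 2/4, so the backbone output stride drops from 32 to 8.
size = 224
for stride in (2, 2, 1, 2, 1, 1):   # conv1, maxpool, layer1, layer2, layer3, layer4 (dilated=True)
    size //= stride
print(size)  # 28, i.e. stride 8; with dilated=False the last two strides are 2, giving 7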
"""Encoding Autograd Fuctions""" """Encoding Autograd Fuctions"""
from .encoding import * from .encoding import *
from .syncbn import * from .syncbn import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Customized Functions"""
import math
import torch
from torch.autograd import Function, Variable
from torch.nn.modules.utils import _pair
from .._ext import encoding_lib
__all__ = ['dilatedavgpool2d']
class _dilatedavgpool2d(Function):
@staticmethod
def forward(ctx, input, kernel_size, stride, padding,
dilation=1):
ctx.kH, ctx.kW = _pair(kernel_size)
ctx.dH, ctx.dW = _pair(stride if stride is not None else kernel_size)
ctx.padH, ctx.padW = _pair(padding)
ctx.dilationH, ctx.dilationW = _pair(dilation)
b, c, h, w = input.size()
if ctx.dH == 1 and ctx.dW == 1:
# keep the size for dilated avgpool
ow, oh = w, h
else:
ow = math.floor(float(w-ctx.kW+2*ctx.padW)/float(ctx.dW)) +1
oh = math.floor(float(h-ctx.kH+2*ctx.padH)/float(ctx.dH)) +1
with torch.cuda.device_of(input):
output = input.new(b, c, oh, ow)
ctx.save_for_backward(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return output
@staticmethod
def backward(ctx, gradOutput):
input, = ctx.saved_variables
with torch.cuda.device_of(input):
gradInput = Variable(input.data.new().resize_as_(input.data))
if isinstance(input.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
elif isinstance(input.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, None, None, None, None
def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
dilation=1):
"""Dilated Average Pool 2d, for dilation of DenseNet.
Reference:
Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang,
Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation." CVPR 2018
Applies 2D average-pooling operation in kh x kw regions by step size
dh x dw steps. The number of output features is equal to the number of
input planes.
See :class:`~encoding.nn.DilatedAvgPool2d` for details and output shape.
Args:
input: input tensor (minibatch x in_channels x iH x iW)
kernel_size: size of the pooling region, a single number or a
tuple (kh x kw)
stride: stride of the pooling operation, a single number or a
tuple (sh x sw). Default is equal to kernel size
padding: implicit zero padding on the input, a single number or
a tuple (padh x padw), Default: 0
dilation: the dilation parameter similar to Conv2d
"""
return _dilatedavgpool2d.apply(input, kernel_size, stride, padding, dilation)
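A small shape sketch (not part of the commit) of the pooling geometry implemented by `_dilatedavgpool2d.forward`: stride defaults to the kernel size, and a stride of 1 keeps the input resolution, which is what the dilated DenseNet transitions rely on.
import math
def dilated_avgpool_out_size(h, w, kernel_size=2, stride=None, padding=0):
    s = kernel_size if stride is None else stride   # same default as the Function above
    if s == 1:
        return h, w                                 # stride-1 dilated pooling keeps the size
    return (math.floor((h - kernel_size + 2 * padding) / s) + 1,
            math.floor((w - kernel_size + 2 * padding) / s) + 1)
print(dilated_avgpool_out_size(56, 56, stride=1))   # (56, 56)
print(dilated_avgpool_out_size(56, 56, stride=2))   # (28, 28)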
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,7 +10,7 @@
 """Functions for Encoding Layer"""
 import torch
 from torch.autograd import Function, Variable
-from .._ext import encoding_lib
+from .. import lib
 __all__ = ['aggregate', 'scaledL2']
@@ -20,47 +19,27 @@ class _aggregate(Function):
     def forward(ctx, A, X, C):
         # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
         ctx.save_for_backward(A, X, C)
-        B, _, K = A.size()
-        D = X.size(2)
-        with torch.cuda.device_of(A):
-            E = A.new(B, K, D)
-        if isinstance(A, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Float_aggregate_forward(E, A, X, C)
-        elif isinstance(A, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Double_aggregate_forward(E, A, X, C)
+        if A.is_cuda:
+            E = lib.gpu.aggregate_forward(A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return E
     @staticmethod
     def backward(ctx, gradE):
         A, X, C = ctx.saved_variables
-        with torch.cuda.device_of(A):
-            gradA = Variable(A.data.new().resize_as_(A.data))
-            gradX = Variable(A.data.new().resize_as_(X.data))
-            gradC = Variable(A.data.new().resize_as_(C.data))
-        if isinstance(A.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Float_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
-        elif isinstance(A.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Double_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
+        if A.is_cuda:
+            gradA, gradX, gradC = lib.gpu.aggregate_backward(gradE, A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradX.data.copy_(torch.bmm(A, gradE).data)
-        gradC.data.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0).data)
+            raise NotImplemented
         return gradA, gradX, gradC
 def aggregate(A, X, C):
-    r"""
-    Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
+    r""" Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
     to the codewords (:math:`C`) with assignment weights (:math:`A`).
     .. math::
         e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)
     Shape:
@@ -77,53 +56,31 @@ def aggregate(A, X, C):
         >>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
         >>> func = encoding.aggregate()
         >>> E = func(A, X, C)
     """
     return _aggregate.apply(A, X, C)
 class _scaledL2(Function):
     @staticmethod
     def forward(ctx, X, C, S):
-        B, N, _ = X.size()
-        K = C.size(0)
-        with torch.cuda.device_of(X):
-            SL = X.new(B, N, K)
-        if isinstance(X, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
-        elif isinstance(X, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
+        if X.is_cuda:
+            SL = lib.gpu.scaled_l2_forward(X, C, S)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         ctx.save_for_backward(X, C, S, SL)
         return SL
     @staticmethod
     def backward(ctx, gradSL):
         X, C, S, SL = ctx.saved_variables
-        K = C.size(0)
-        with torch.cuda.device_of(X.data):
-            gradX = Variable(X.data.new().resize_as_(X.data))
-            gradC = Variable(X.data.new().resize_as_(C.data))
-            gradS = Variable(X.data.new().resize_as_(S.data))
-        if isinstance(X.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
-        elif isinstance(X.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
+        if X.is_cuda:
+            gradX, gradC, gradS = lib.gpu.scaled_l2_backward(gradSL, X, C, S, SL)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradS.data.copy_((gradSL*(SL/S.view(1, 1, K))).sum(0).sum(0).data)
+            raise NotImplemented
         return gradX, gradC, gradS
 def scaledL2(X, C, S):
-    r"""
-    scaledL2 distance
+    r""" scaledL2 distance
     .. math::
         sl_{ik} = s_k \|x_i-c_k\|^2
@@ -134,6 +91,5 @@ def scaledL2(X, C, S):
     (where :math:`B` is batch, :math:`N` is total number of features,
      :math:`K` is number of codewords, :math:`D` is feature dimensions.)
     - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
     """
     return _scaledL2.apply(X, C, S)
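For clarity, a CPU reference sketch (not part of the commit) of the math that the aggregate and scaledL2 GPU kernels implement; shapes follow the docstrings above (A: BxNxK, X: BxNxD, C: KxD, S: K).
import torch
def aggregate_ref(A, X, C):
    # e_k = sum_i a_ik * (x_i - c_k)  ->  (B, K, D)
    return (A.unsqueeze(3) * (X.unsqueeze(2) - C.view(1, 1, *C.size()))).sum(1)
def scaled_l2_ref(X, C, S):
    # sl_ik = s_k * ||x_i - c_k||^2   ->  (B, N, K)
    return S.view(1, 1, -1) * (X.unsqueeze(2) - C.view(1, 1, *C.size())).pow(2).sum(3)
B, N, K, D = 2, 5, 4, 3
A, X = torch.rand(B, N, K), torch.rand(B, N, D)
C, S = torch.rand(K, D), torch.rand(K)
print(aggregate_ref(A, X, C).shape, scaled_l2_ref(X, C, S).shape)  # (2, 4, 3) (2, 5, 4)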
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,9 +10,9 @@
 """Synchronized Cross-GPU Batch Normalization functions"""
 import torch
 from torch.autograd import Variable, Function
-from .._ext import encoding_lib
+from .. import lib
-__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
+__all__ = ['sum_square', 'batchnormtrain']
 def sum_square(input):
     r"""Calculate sum of elements and sum of squares for Batch Normalization"""
@@ -24,91 +23,45 @@ class _sum_square(Function):
     @staticmethod
     def forward(ctx, input):
         ctx.save_for_backward(input)
-        C = input.size(1)
-        with torch.cuda.device_of(input):
-            xsum = input.new().resize_(C).zero_()
-            xsquare = input.new().resize_(C).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_sum_square_Forward(
-                    input, xsum, xsquare)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_sum_square_Forward(
-                    input, xsum, xsquare)
+        if input.is_cuda:
+            xsum, xsqusum = lib.gpu.sumsquare_forward(input)
         else:
-            raise RuntimeError('Unimplemented data type!', type(input))
-        return xsum, xsquare
+            raise NotImplemented
+        return xsum, xsqusum
     @staticmethod
     def backward(ctx, gradSum, gradSquare):
         input, = ctx.saved_variables
-        with torch.cuda.device_of(input.data):
-            gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
-        if isinstance(input.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Float_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
-        elif isinstance(input.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Double_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
+        if input.is_cuda:
+            gradInput = lib.gpu.sumsquare_backward(input, gradSum, gradSquare)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return gradInput
-class _batchnorm(Function):
-    def __init__(self, training=False):
-        super(_batchnorm, self).__init__()
-        self.training = training
-    def forward(self, input, gamma, beta, mean, std):
-        self.save_for_backward(input, gamma, beta, mean, std)
-        assert(input.dim() == 3)
-        with torch.cuda.device_of(input):
-            invstd = 1.0 / std
-            output = input.new().resize_as_(input)
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
+class _batchnormtrain(Function):
+    @staticmethod
+    def forward(ctx, input, mean, std, gamma, beta):
+        ctx.save_for_backward(input, mean, std, gamma, beta)
+        if input.is_cuda:
+            output = lib.gpu.batchnorm_forward(input, mean, std, gamma, beta)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return output
-    def backward(self, gradOutput):
-        input, gamma, beta, mean, std = self.saved_tensors
-        invstd = 1.0 / std
-        with torch.cuda.device_of(input):
-            gradInput = gradOutput.new().resize_as_(input).zero_()
-            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
-            gradBeta = gradOutput.new().resize_as_(beta).zero_()
-            gradMean = gradOutput.new().resize_as_(mean).zero_()
-            gradStd = gradOutput.new().resize_as_(std).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
+    @staticmethod
+    def backward(ctx, gradOutput):
+        input, mean, std, gamma, beta = ctx.saved_variables
+        if gradOutput.is_cuda:
+            gradInput, gradMean, gradStd, gradGamma, gradBeta = \
+                lib.gpu.batchnorm_backward(gradOutput, input, mean,
+                                           std, gamma, beta, True)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        return gradInput, gradGamma, gradBeta, gradMean, gradStd
+            raise NotImplemented
+        return gradInput, gradMean, gradStd, gradGamma, gradBeta
-def batchnormtrain(input, gamma, beta, mean, std):
+def batchnormtrain(input, mean, std, gamma, beta):
     r"""Applies Batch Normalization over a 3d input that is seen as a
     mini-batch.
@@ -123,14 +76,4 @@ def batchnormtrain(input, gamma, beta, mean, std):
     - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
     """
-    return _batchnorm(True)(input, gamma, beta, mean, std)
+    return _batchnormtrain.apply(input, mean, std, gamma, beta)
-def batchnormeval(input, gamma, beta, mean, std):
-    r"""Applies Batch Normalization over a 3d input that is seen as a
-    mini-batch.
-    Please see encoding.batchnormtrain_
-    """
-    return _batchnorm(False)(input, gamma, beta, mean, std)
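A plain-PyTorch sketch (not part of the commit) of what `sum_square` and `batchnormtrain` compute for a 3d (B, C, L) input with per-channel statistics; the GPU kernels fuse these operations, this is only the reference math.
import torch
def sum_square_ref(x):
    # per-channel sum and sum of squares over batch and spatial positions
    return x.sum(dim=(0, 2)), x.pow(2).sum(dim=(0, 2))
def batchnormtrain_ref(x, mean, std, gamma, beta):
    # y = gamma * (x - mean) / std + beta, broadcast over the channel dimension
    x_hat = (x - mean.view(1, -1, 1)) / std.view(1, -1, 1)
    return gamma.view(1, -1, 1) * x_hat + beta.view(1, -1, 1)
x = torch.randn(4, 3, 16)
xsum, xsqu = sum_square_ref(x)
n = x.size(0) * x.size(2)
mean = xsum / n
std = (xsqu / n - mean.pow(2)).sqrt()   # biased variance, as in the usual BN training pass
print(batchnormtrain_ref(x, mean, std, torch.ones(3), torch.zeros(3)).shape)  # (4, 3, 16)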
// The maximum number of threads in a block
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 512;
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
for (int i = 0; i != 5; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
__device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/device_tensor.h"
#else
template <int Dim>
THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
if (!t) {
return THCDeviceTensor<real, Dim>();
}
int inDim = THCTensor_(nDimension)(state, t);
return toDeviceTensor<real, Dim>(state, t);
/*
if (inDim == Dim) {
return toDeviceTensor<real, Dim>(state, t);
}
// View in which the last dimensions are collapsed or expanded as needed
THAssert(THCTensor_(isContiguous)(state, t));
int size[Dim];
for (int i = 0; i < Dim || i < inDim; ++i) {
if (i < Dim && i < inDim) {
size[i] = t->size[i];
} else if (i < Dim) {
size[i] = 1;
} else {
size[Dim - 1] *= t->size[i];
}
}
return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
*/
}
struct Encoding_(Float2)
/*
* For reduce-sum calculation of two elements
*/
{
real v1, v2;
__device__ Encoding_(Float2)() {}
__device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
__device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a)
{
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if CUDA_VERSION >= 9000
unsigned int mask = 0xffffffff;
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor_sync(mask, val, 1 << i, WARP_SIZE);
}
#else
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
__shared__ real values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
#endif
return val;
}
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(
Encoding_(Float2) value)
{
value.v1 = Encoding_(rwarpSum)(value.v1);
value.v2 = Encoding_(rwarpSum)(value.v2);
return value;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
Encoding_(AggOp) g(A,X,C);
E[b][k][d] = Encoding_(reduce_agg)(g,b,k,d,N);
}
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregating forward the residuals with assignment weights
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// grid: (D, K, B), one block per output element E[b][k][d]
dim3 blocks(C.getSize(1), C.getSize(0), X.getSize(0));
// block threads stride over N and reduce
dim3 threads(getNumThreads(X.getSize(1)));
Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating backward kernel function
* G (dl/dR), L (dl/dE), A
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
Encoding_(AggBackOp) g(GE,X,C);
GA[b][i][k] = Encoding_(reduce_aggback)(g,b,i,k,D);
}
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate backward to assignment weights
* G (dl/dR), L (dl/dE), A
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// grid: (K, N, B), one block per output element GA[b][i][k]
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
// block threads stride over D and reduce
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GA, GE, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
THCDeviceTensor<real, 3> SL,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
* scaled-L2 forward kernel function
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
Encoding_(L2Op) g(X,C);
SL[b][i][k] = S[k] * Encoding_(reduce_sl2)(g,b,i,k,D);
}
void Encoding_(ScaledL2_Forward)(
THCState *state, THCTensor *SL_, THCTensor *X_,
THCTensor *C_, THCTensor *S_)
/*
* computing the scaled L2 distances between inputs and codewords
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
if (THCTensor_(nDimension)(state, SL_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
(SL, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 3> GX,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
i = blockIdx.y;
K = C.getSize(0);
/* main operation */
Encoding_(L2XBackOp) g(GSL,X,C,S);
GX[b][i][d] = Encoding_(reduce_sl2xback)(g,b,i,d,K);
}
__global__ void Encoding_(ScaledL2C_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 2> GC,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
Encoding_(L2CBackOp) g(GSL,X,C,S);
GC[k][d] = Encoding_(reduce_sl2cback)(g,k,d,B,N);
}
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
if (THCTensor_(nDimension)(state, GSL_) != 3 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GC_) != 2 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GC = devicetensor<2>(state, GC_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(X.getSize(2), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(0)));
Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GSL, GX, X, C, S);
THCudaCheck(cudaGetLastError());
dim3 blocks2(C.getSize(1), C.getSize(0));
dim3 threads2(getNumThreads(X.getSize(1)));
Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads2, 0, stream>>>
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_utils.c"
#else
struct Encoding_(AggOp) {
__device__ Encoding_(AggOp)(THCDeviceTensor<real, 3> a,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: A(a), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> A;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_agg)(
Encoding_(AggOp) op,
int b, int k, int d, int N)
{
real sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(AggBackOp) {
__device__ Encoding_(AggBackOp)(THCDeviceTensor<real, 3> ge,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: GE(ge), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GE;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_aggback)(
Encoding_(AggBackOp) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2Op) {
__device__ Encoding_(L2Op)(THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
real r = X[b][i][d].ldg() - C[k][d].ldg();
return r * r;
}
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_sl2)(
Encoding_(L2Op) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
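/*
 * Backward operators for the scaled L2 term SL[b][i][k] = S[k] * ||X[b][i] - C[k]||^2:
 *     dSL/dX[b][i][d] =  2 * S[k] * (X[b][i][d] - C[k][d])
 *     dSL/dC[k][d]    = -2 * S[k] * (X[b][i][d] - C[k][d])
 * Each term is weighted by the incoming gradient GSL[b][i][k]; gradX sums
 * over k (reduce_sl2xback) and gradC sums over b and i (reduce_sl2cback).
 */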
struct Encoding_(L2XBackOp) {
__device__ Encoding_(L2XBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return 2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2xback)(
Encoding_(L2XBackOp) op,
int b, int i, int d, int K)
{
real sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2CBackOp) {
__device__ Encoding_(L2CBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return -2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2cback)(
Encoding_(L2CBackOp) op,
int k, int d, int B, int N)
{
real sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.c"
#else
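/*
 * Each thread produces one output element Y[b][c][h][w]: its window starts at
 * (h*dH - padH, w*dW - padW), samples the input every dilationH/dilationW
 * pixels, clips to the input bounds, and divides by the number of in-bounds
 * samples (pool_size).
 */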
__global__ void Encoding_(DilatedAvgPool_Forward_kernel) (
THCDeviceTensor<real, 4> X,
THCDeviceTensor<real, 4> Y,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d forward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = Y.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= Y.getSize(3) || h >= Y.getSize(2)) return;
int hstart = h*dH -padH;
int wstart = w*dW -padW;
int hend = min(hstart + kH*dilationH, X.getSize(2));
int wend = min(wstart + kW*dilationW, X.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum = 0;
for (int th=hstart; th < hend; th+=dilationH) {
for (int tw=wstart; tw < wend; tw+=dilationW) {
sum += X[b][c][th][tw];
}
}
Y[b][c][h][w] = sum / pool_size;
}
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, X_, Y_);
if (THCTensor_(nDimension)(state, X_) != 4 ||
THCTensor_(nDimension)(state, Y_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> X = devicetensor<4>(state, X_);
THCDeviceTensor<real, 4> Y = devicetensor<4>(state, Y_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(Y.getSize(3)/16+1, Y.getSize(2)/16+1,
Y.getSize(1)*Y.getSize(0));
Encoding_(DilatedAvgPool_Forward_kernel)<<<blocks, threads, 0, stream>>>
(X, Y, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
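/*
 * Backward pass: each thread handles one input position (b, c, h, w) and
 * accumulates gradY[b][c][ph][pw] / pool_size over every output window that
 * contains it; phstart/phend and pwstart/pwend bound those windows.
 */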
__global__ void Encoding_(DilatedAvgPool_Backward_kernel) (
THCDeviceTensor<real, 4> gradX,
THCDeviceTensor<real, 4> gradY,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
 * dilated avgpool2d backward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = gradX.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= gradX.getSize(3) || h >= gradX.getSize(2)) return;
int phstart = (h + padH < ((kH-1)*dilationH+1)) ? 0 :
(h + padH - ((kH-1)*dilationH+1))/dH + 1;
int pwstart = (w + padW < ((kW-1)*dilationW+1)) ? 0 :
(w + padW - ((kW-1)*dilationW+1))/dW + 1;
int phend = min((h+padH)/dH+1, gradY.getSize(2));
int pwend = min((w+padW)/dW+1, gradY.getSize(3));
sum = 0;
int hstart, wstart, hend, wend, pool_size;
for (int ph=phstart; ph < phend; ++ph) {
for (int pw=pwstart; pw < pwend; ++pw) {
            hstart = ph*dH -padH;
wstart = pw*dW -padW;
hend = min(hstart + kH*dilationH, gradX.getSize(2));
wend = min(wstart + kW*dilationW, gradX.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum += gradY[b][c][ph][pw] / pool_size;
}
}
gradX[b][c][h][w] = sum;
}
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
 * dilated avgpool2d backward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, gradX_, gradY_);
if (THCTensor_(nDimension)(state, gradX_) != 4 ||
THCTensor_(nDimension)(state, gradY_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> gradX = devicetensor<4>(state, gradX_);
THCDeviceTensor<real, 4> gradY = devicetensor<4>(state, gradY_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(gradX.getSize(3)/16+1, gradX.getSize(2)/16+1,
gradX.getSize(1)*gradX.getSize(0));
Encoding_(DilatedAvgPool_Backward_kernel)<<<blocks, threads, 0, stream>>>
(gradX, gradY, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.h"
#else
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.c"
#else
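/*
 * Synchronized batch-norm forward on a flattened (B x C x N) input: one block
 * per channel c, threads stride over the N positions and compute
 *     y = gamma[c] * (x - mean[c]) * invstd[c] + beta[c].
 */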
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
real inp = input[b][c][x].ldg();
output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
invstd[c].ldg() + beta[c].ldg();
}
}
}
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
* batch norm forward function
 * assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
gamma_, beta_);
if (THCTensor_(nDimension)(state, output_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1)
THError("BatchNorm2d forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
output, input, mean, invstd, gamma, beta);
THCudaCheck(cudaGetLastError());
}
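/*
 * Backward statistics for synchronized batch-norm. GradOp yields the pair
 * (g, g * (x - mean)); reducing it over the batch and spatial dimensions
 * gives, per channel, the gradient sum (res.v1) and the dot product with the
 * centered input (res.v2), which the backward kernel turns into gradInput,
 * gradGamma, gradBeta and, in training mode, gradMean and gradStd.
 */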
struct Encoding_(GradOp) {
__device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = gradOutput[batch][plane][n].ldg();
real c = input[batch][plane][n].ldg() - mean;
return Encoding_(Float2)(g, g * c);
}
real mean;
THCDeviceTensor<real, 3> input;
THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, b/c/n) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce)(
Encoding_(GradOp) op,
THCDeviceTensor<real, 3> tensor,
int plane)
{
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(BatchNorm_Backward_kernel) (
THCDeviceTensor<real, 3> gradoutput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 3> gradinput,
THCDeviceTensor<real, 1> gradgamma,
THCDeviceTensor<real, 1> gradbeta,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta,
THCDeviceTensor<real, 1> gradMean,
THCDeviceTensor<real, 1> gradStd,
int train)
{
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
real dotP = res.v2;
//real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
real gradScale = invstd[c].ldg() * gamma[c].ldg();
if (train && threadIdx.x == 0) {
gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
gradgamma[c] += dotP * invstd[c].ldg();
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
* batch norm backward function
 * assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 9, gradoutput_, input_, gradinput_,
        gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_);
if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradinput_) != 3 ||
THCTensor_(nDimension)(state, gradgamma_) != 1 ||
THCTensor_(nDimension)(state, gradbeta_) != 1 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1 ||
THCTensor_(nDimension)(state, gradMean_) != 1 ||
THCTensor_(nDimension)(state, gradStd_) != 1 )
THError("BatchNorm2d backward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradoutput =
devicetensor<3>(state, gradoutput_);
THCDeviceTensor<real, 3> input =
devicetensor<3>(state, input_);
THCDeviceTensor<real, 3> gradinput =
devicetensor<3>(state, gradinput_);
THCDeviceTensor<real, 1> gradgamma =
devicetensor<1>(state, gradgamma_);
THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
gamma, beta, gradMean, gradStd, train);
THCudaCheck(cudaGetLastError());
}
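/*
 * Per-channel sum and sum of squares, used to compute the mean and variance
 * for synchronized batch-norm:
 *     sum[c]    = sum_{b,x} input[b][c][x]
 *     square[c] = sum_{b,x} input[b][c][x]^2
 * The matching backward distributes the gradients back to each element as
 *     gradInput = gradSum[c] + 2 * gradSquare[c] * input.
 */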
struct Encoding_(SumOp) {
__device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
: input(i){}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = input[batch][plane][n].ldg();
return Encoding_(Float2)(g, g * g);
}
real mean;
THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(Sum_Square_Forward_kernel) (
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> sum,
THCDeviceTensor<real, 1> square)
{
int c = blockIdx.x;
/* main operation */
Encoding_(SumOp) g(input);
Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
real xsum = res.v1;
real xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
 * per-channel sum and sum-of-squares forward function
 * assuming the input is already flattened
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
if (THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, sum_) != 1 ||
THCTensor_(nDimension)(state, square_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
input, sum, square);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(Sum_Square_Backward_kernel) (
THCDeviceTensor<real, 3> gradInput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> gradSum,
THCDeviceTensor<real, 1> gradSquare)
{
int c = blockIdx.x;
/* main operation */
for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
{
gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
input[batch][c][x];
}
}
}
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput_, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * per-channel sum and sum-of-squares backward function
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
gradSquare_);
if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradSum_) != 1 ||
THCTensor_(nDimension)(state, gradSquare_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
    THCDeviceTensor<real, 1> gradSquare = devicetensor<1>(state, gradSquare_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradInput, input, gradSum, gradSquare);
THCudaCheck(cudaGetLastError());
}
#endif