Commit 8f8fbb9f authored by Hang Zhang's avatar Hang Zhang
Browse files

v1.0.1

parent aa9af7fd
from .resnet import *
from .densenet import *
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
from ..nn import DilatedAvgPool2d
# Public API of this module.
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']

# URLs of ImageNet-pretrained checkpoints hosted on the torchvision model zoo.
model_urls = {
    'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
    'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
    'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
    'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
def densenet121(pretrained=False, **kwargs):
    r"""Build a Densenet-121 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=64, growth_rate=32,
                   block_config=(6, 12, 24, 16), **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['densenet121'])
        net.load_state_dict(weights)
    return net
def densenet169(pretrained=False, **kwargs):
    r"""Build a Densenet-169 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=64, growth_rate=32,
                   block_config=(6, 12, 32, 32), **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['densenet169'])
        net.load_state_dict(weights)
    return net
def densenet201(pretrained=False, **kwargs):
    r"""Build a Densenet-201 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=64, growth_rate=32,
                   block_config=(6, 12, 48, 32), **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['densenet201'])
        net.load_state_dict(weights)
    return net
def densenet161(pretrained=False, **kwargs):
    r"""Build a Densenet-161 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=96, growth_rate=48,
                   block_config=(6, 12, 36, 24), **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['densenet161'])
        net.load_state_dict(weights)
    return net
class _DenseLayer(nn.Sequential):
    """One BN-ReLU-Conv(1x1) -> BN-ReLU-Conv(3x3) bottleneck layer of a dense
    block; returns the input concatenated with the newly produced features.

    Args:
        num_input_features (int): channels of the incoming feature map
        growth_rate (int): channels produced by this layer (``k`` in the paper)
        bn_size (int): bottleneck multiplier (1x1 conv emits ``bn_size * k``)
        drop_rate (float): dropout probability applied to the new features
        dilation (int): dilation (with matching padding) of the 3x3 conv
    """
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
        super(_DenseLayer, self).__init__()
        # NOTE(review): the dotted sub-module names ('norm.1', 'conv.1', ...)
        # mirror the key layout of the pretrained checkpoints; newer PyTorch
        # versions reject '.' in add_module names -- confirm before upgrading.
        self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
        self.add_module('relu.1', nn.ReLU(inplace=True)),
        self.add_module('conv.1', nn.Conv2d(num_input_features, bn_size *
                        growth_rate, kernel_size=1, stride=1, bias=False)),
        self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
        self.add_module('relu.2', nn.ReLU(inplace=True)),
        self.add_module('conv.2', nn.Conv2d(bn_size * growth_rate, growth_rate,
                        kernel_size=3, stride=1, padding=dilation, dilation=dilation,
                        bias=False)),
        self.drop_rate = drop_rate

    def forward(self, x):
        # Run the Sequential stack, optionally drop out the new features,
        # then concatenate them onto the input along the channel axis.
        new_features = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
        return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
    """A stack of ``num_layers`` :class:`_DenseLayer` modules; each layer sees
    the channel-concatenated output of all layers before it."""

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, dilation=1):
        super(_DenseBlock, self).__init__()
        for idx in range(num_layers):
            # Input channels grow by growth_rate per preceding layer.
            in_channels = num_input_features + idx * growth_rate
            self.add_module(
                'denselayer%d' % (idx + 1),
                _DenseLayer(in_channels, growth_rate, bn_size,
                            drop_rate, dilation=dilation))
class _Transition(nn.Sequential):
    """Transition between dense blocks: BN -> ReLU -> 1x1 conv (channel
    reduction) -> 2x2 dilated average pool.

    Args:
        num_input_features (int): incoming channels
        num_output_features (int): channels after the 1x1 conv
        stride (int): pooling stride (1 keeps spatial size, 2 downsamples)
        dilation (int): dilation forwarded to DilatedAvgPool2d
    """
    def __init__(self, num_input_features, num_output_features, stride, dilation=1):
        super(_Transition, self).__init__()
        self.add_module('norm', nn.BatchNorm2d(num_input_features))
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
                        kernel_size=1, stride=1, bias=False))
        # Project-local pool that supports dilation (plain AvgPool2d does not).
        self.add_module('pool', DilatedAvgPool2d(kernel_size=2, stride=stride,
                        dilation=dilation))
class DenseNet(nn.Module):
    r"""Dilated Densenet-BC backbone.

    Stages 3 and 4 use dilation (2 and 4) instead of further downsampling, so
    :meth:`forward` yields a dense feature map rather than class scores.

    Args:
        growth_rate (int): filters added per layer (``k`` in the paper)
        block_config (list of 4 ints): layers in each dense block
        num_init_features (int): filters of the first convolution
        bn_size (int): bottleneck multiplier (``bn_size * k`` bottleneck width)
        drop_rate (float): dropout rate after each dense layer
        num_classes (int): size of the (currently unused) classifier head

    Reference:
        Huang, Gao, et al. "Densely Connected Convolutional Networks" *CVPR 2017*
    """
    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
        super(DenseNet, self).__init__()
        # Stem: 7x7 stride-2 conv -> BN -> ReLU -> 3x3 stride-2 max pool.
        self.features = nn.Sequential(OrderedDict([
            ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
            ('norm0', nn.BatchNorm2d(num_init_features)),
            ('relu0', nn.ReLU(inplace=True)),
            ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
        ]))
        # Only the second transition downsamples; later stages dilate instead.
        strides = [1, 2, 1, 1]
        dilations = [1, 1, 2, 4]
        channels = num_init_features
        for stage, num_layers in enumerate(block_config):
            self.features.add_module(
                'denseblock%d' % (stage + 1),
                _DenseBlock(num_layers=num_layers, num_input_features=channels,
                            bn_size=bn_size, growth_rate=growth_rate,
                            drop_rate=drop_rate, dilation=dilations[stage]))
            channels = channels + num_layers * growth_rate
            if stage != len(block_config) - 1:
                # Halve the channel count between blocks.
                self.features.add_module(
                    'transition%d' % (stage + 1),
                    _Transition(num_input_features=channels,
                                num_output_features=channels // 2,
                                stride=strides[stage + 1],
                                dilation=dilations[stage]))
                channels = channels // 2
        # Final batch norm.
        self.features.add_module('norm5', nn.BatchNorm2d(channels))
        # Classifier head; defined but not used by forward() (see below).
        self.classifier = nn.Linear(channels, num_classes)

    def forward(self, x):
        features = self.features(x)
        out = F.relu(features, inplace=True)
        # Classification head is disabled upstream; kept for reference:
        #   out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
        #   out = self.classifier(out)
        return out
from .. import nn
import math
from torch.autograd import Variable
import torch.utils.model_zoo as model_zoo
# Public API of this module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']

# URLs of ImageNet-pretrained checkpoints hosted on the torchvision model zoo.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with 1-pixel padding."""
    conv = nn.Conv2d(in_planes, out_planes, kernel_size=3,
                     stride=stride, padding=1, bias=False)
    return conv
class BasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions and an additive skip.

    ``dilation`` controls the first conv, ``fist_dilation`` (sic, kept for
    interface compatibility) controls the second; ``downsample`` projects the
    identity branch when shapes differ.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                               padding=dilation, dilation=dilation, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=fist_dilation, dilation=fist_dilation, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Identity branch, projected if a downsample module was supplied.
        identity = x if self.downsample is None else self.downsample(x)
        y = self.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        y += identity
        return self.relu(y)
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.dilation = dilation
self.stride = stride
def _sum_each(self, x, y):
assert(len(x)==len(y))
z = []
for i in range(len(x)):
z.append(x[i]+y[i])
return z
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if isinstance(out, Variable):
out += residual
elif isinstance(out, tuple) or isinstance(out, list):
out = self._sum_each(out, residual)
out = self.relu(out)
return out
class ResNet(nn.Module):
    """Dilated pre-trained ResNet model, which produces stride-8 featuremaps
    at conv5 (layers 3 and 4 use dilation 2 and 4 instead of striding).

    Reference:
        Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """
    def __init__(self, block, layers, num_classes=1000):
        # block: BasicBlock or Bottleneck; layers: number of blocks per stage.
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Stages 3/4 keep stride 1 and dilate, giving overall stride 8.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)
        # NOTE(review): with stride-8 features a 224 input yields 28x28 maps,
        # so avgpool(7) leaves 4x4 cells before the flatten; the classifier
        # path looks inconsistent with the dilated backbone -- confirm usage.
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # He-style init for convs; BN initialized to identity.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
        """Build one residual stage of ``blocks`` blocks."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 projection so the identity branch matches the output shape.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        # First block of a dilation-4 stage uses dilation 2 (gradual ramp-up);
        # 'fist_dilation' (sic) sets the block's trailing-conv dilation.
        if dilation == 1 or dilation == 2:
            layers.append(block(self.inplanes, planes, stride, dilation=1,
                                downsample=downsample, fist_dilation=dilation))
        elif dilation == 4:
            layers.append(block(self.inplanes, planes, stride, dilation=2,
                                downsample=downsample, fist_dilation=dilation))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation, fist_dilation=dilation))
        return nn.Sequential(*layers)

    def forward(self, x):
        # Stem.
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        # Residual stages.
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # Classifier head (see NOTE in __init__ about spatial sizes).
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
def resnet18(pretrained=False, **kwargs):
    """Build a ResNet-18 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['resnet18'])
        net.load_state_dict(weights)
    return net
def resnet34(pretrained=False, **kwargs):
    """Build a ResNet-34 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['resnet34'])
        net.load_state_dict(weights)
    return net
def resnet50(pretrained=False, **kwargs):
    """Build a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['resnet50'])
        net.load_state_dict(weights)
    return net
def resnet101(pretrained=False, **kwargs):
    """Build a ResNet-101 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['resnet101'])
        net.load_state_dict(weights)
    return net
def resnet152(pretrained=False, **kwargs):
    """Build a ResNet-152 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        weights = model_zoo.load_url(model_urls['resnet152'])
        net.load_state_dict(weights)
    return net
if __name__ == "__main__":
    # Smoke test: build a dilated ResNet-101, load the ImageNet checkpoint
    # (downloads over the network), and print the last stage.
    model = ResNet(Bottleneck, [3, 4, 23, 3])
    model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    print(model.layer4)
from .aggregate import aggregate, scaledL2, aggregateP, residual, square_squeeze, assign
from .encoding import *
from .basic import *
from .syncbn import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Public helpers of this module.
__all__ = ['view_each', 'multi_each', 'sum_each', 'upsample']
def view_each(x, size):
    """Apply ``.view(size)`` to each element of *x*; returns a new list."""
    return [x[i].view(size) for i in range(len(x))]
def multi_each(a, b):
    """Element-wise product ``a[i] * b[i]``; returns a new list."""
    return [a[i] * b[i] for i in range(len(a))]
def sum_each(x, y):
    """Element-wise sum of two equal-length sequences; returns a new list."""
    assert(len(x) == len(y))
    return [u + v for u, v in zip(x, y)]
def upsample(input, size=None, scale_factor=None, mode='nearest'):
    """Upsample a tensor, or each tensor of a list/tuple (one worker thread
    per element, e.g. per-GPU replicas), via ``F.upsample``.

    Args:
        input: a Variable/Tensor, or a list/tuple of them
        size, scale_factor, mode: forwarded to ``F.upsample``

    Returns:
        The upsampled tensor, or a list of upsampled tensors (input order).

    Raises:
        RuntimeError: if *input* is neither a Variable nor a list/tuple.
    """
    if isinstance(input, Variable):
        return F.upsample(input, size=size, scale_factor=scale_factor,
                          mode=mode)
    elif isinstance(input, tuple) or isinstance(input, list):
        lock = threading.Lock()
        results = {}

        def _worker(i, x):
            try:
                with torch.cuda.device_of(x):
                    result = F.upsample(x, size=size,
                                        scale_factor=scale_factor, mode=mode)
                with lock:
                    results[i] = result
            except Exception as e:
                # BUG FIX: was `resutls[i] = e` (NameError), which crashed the
                # worker and made the gather below fail with a missing key
                # instead of re-raising the original exception.
                with lock:
                    results[i] = e

        # One thread per element (intended for tensors on different GPUs).
        threads = [threading.Thread(target=_worker, args=(i, x))
                   for i, x in enumerate(input)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        # Gather in input order, re-raising any worker exception.
        outputs = []
        for i in range(len(input)):
            res = results[i]
            if isinstance(res, Exception):
                raise res
            outputs.append(res)
        return outputs
    else:
        raise RuntimeError('unknown input type')
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from torch.nn.modules.utils import _single, _pair, _triple
from .._ext import encoding_lib
# Public API of this module.
__all__ = ['dilatedavgpool2d']
class _dilatedavgpool2d(Function):
    """Old-style autograd Function wrapping the CUDA extension's dilated
    average pooling (forward and backward)."""

    def forward(self, input, kernel_size, stride, padding,
                dilation=1):
        # Normalize all hyper-parameters to (h, w) pairs; stride defaults to
        # the kernel size, matching nn.AvgPool2d semantics.
        self.kH, self.kW = _pair(kernel_size)
        self.dH, self.dW = _pair(stride if stride is not None else
                                 kernel_size)
        self.padH, self.padW = _pair(padding)
        self.dilationH, self.dilationW = _pair(dilation)
        b, c, h, w = input.size()
        if self.dH == 1 and self.dW == 1:
            # Stride 1 keeps the spatial size (the point of dilated avgpool).
            ow, oh = w, h
        else:
            # BUG FIX: math.floor yields a float on Python 2 (and the division
            # is float anyway); sizes passed to new() must be ints.
            ow = int(math.floor(float(w - self.kW + 2 * self.padW) / float(self.dW))) + 1
            oh = int(math.floor(float(h - self.kH + 2 * self.padH) / float(self.dH))) + 1
        output = input.new(b, c, oh, ow)
        self.save_for_backward(input)
        encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(input, output,
            self.kH, self.kW, self.dH, self.dW, self.padH, self.padW,
            self.dilationH, self.dilationW)
        return output

    def backward(self, gradOutput):
        input, = self.saved_variables
        gradInput = input.new().resize_as_(input)
        # BUG FIX: the extension call previously used the undefined lowercase
        # names `gradinput` / `gradoutput`, raising NameError on any backward.
        encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
            gradInput, gradOutput,
            self.kH, self.kW, self.dH, self.dW, self.padH, self.padW,
            self.dilationH, self.dilationW)
        # Only `input` receives a gradient; the hyper-parameters do not.
        return gradInput, None, None, None, None
def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
                     dilation=1):
    """Dilated Average Pool 2d, for dilation of DenseNet.

    Applies a 2D average-pooling operation in ``kh x kw`` regions with step
    size ``dh x dw``. The number of output features equals the number of
    input planes. See :class:`~encoding.nn.DilatedAvgPool2d` for details and
    the output shape.

    Args:
        input: input tensor (minibatch x in_channels x iH x iW)
        kernel_size: pooling region size, a single number or a tuple (kh x kw)
        stride: pooling stride, a single number or a tuple (sh x sw);
            defaults to the kernel size
        padding: implicit zero padding, a single number or a tuple
            (padh x padw), Default: 0
        dilation: the dilation parameter, analogous to Conv2d
    """
    args = (input, kernel_size, stride, padding, dilation)
    return _dilatedavgpool2d.apply(*args)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
# Public API of this module.
__all__ = ['aggregate', 'scaledL2', 'aggregateP', 'residual', 'assign']
class _aggregate(Function):
    """Old-style autograd Function backing :func:`aggregate`.

    Forward computes, via the CUDA extension,
    ``E[b,k,d] = sum_i A[b,i,k] * (X[b,i,d] - C[k,d])``;
    backward returns gradients for A (extension kernel), X and C (analytic).
    Only CUDA float/double tensors are supported.
    """

    def forward(self, A, X, C):
        # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
        self.save_for_backward(A, X, C)
        B, N, K = A.size()
        D = X.size(2)
        with torch.cuda.device_of(A):
            E = A.new(B, K, D)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        return E

    def backward(self, gradE):
        A, X, C = self.saved_tensors
        with torch.cuda.device_of(A):
            gradA = A.new().resize_as_(A)
            gradX = A.new().resize_as_(X)
            gradC = A.new().resize_as_(C)
        # The extension fills gradA only; gradX/gradC are computed below.
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregateE_backward(gradA,
                    gradE, A, X, C)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregateE_backward(gradA,
                    gradE, A, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        # dL/dX = A @ gradE (batched); dL/dC = -sum_b gradE * sum_i A.
        gradX.copy_(torch.bmm(A, gradE))
        gradC.copy_((-gradE * A.sum(1).unsqueeze(2)).sum(0))
        return gradA, gradX, gradC
def aggregate(A, X, C):
    r"""
    Aggregate operation: accumulate the residuals of the inputs (:math:`X`)
    with respect to the codewords (:math:`C`) using assignment weights
    (:math:`A`).

    .. math::
        e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)

    Shape:
        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}` :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`

    Examples:
        >>> B,N,K,D = 2,3,4,5
        >>> A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5), requires_grad=True)
        >>> X = Variable(torch.cuda.DoubleTensor(B,N,D).uniform_(-0.5,0.5), requires_grad=True)
        >>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
        >>> func = encoding.aggregate()
        >>> E = func(A, X, C)
    """
    func = _aggregate()
    return func(A, X, C)
class _scaledL2(Function):
    """Old-style autograd Function backing :func:`scaledL2`: per-codeword
    scaled squared L2 distances. Only CUDA float/double are supported."""

    def forward(self, X, C, S):
        B, N, D = X.size()
        K = C.size(0)
        with torch.cuda.device_of(X):
            SL = X.new(B, N, K)
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
        else:
            raise RuntimeError('Unimplemented data type!')
        # SL itself is saved too: backward uses it for the gradient w.r.t. S.
        self.save_for_backward(X, C, S, SL)
        return SL

    def backward(self, gradSL):
        X, C, S, SL = self.saved_tensors
        K = C.size(0)
        with torch.cuda.device_of(X):
            gradX = X.new().resize_as_(X)
            gradC = X.new().resize_as_(C)
            gradS = X.new().resize_as_(S)
        # The extension fills gradX and gradC; gradS is computed below.
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_scaledl2_backward(gradSL,
                    gradX, gradC, X, C, S)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_scaledl2_backward(gradSL,
                    gradX, gradC, X, C, S)
        else:
            raise RuntimeError('Unimplemented data type!')
        # Since SL = S * L (L the unscaled distance), dL/dS = sum gradSL * SL/S.
        gradS.copy_((gradSL * (SL / S.view(1, 1, K))).sum(0).sum(0))
        return gradX, gradC, gradS
def scaledL2(X, C, S):
    r"""
    Scaled L2 distance between each input and each codeword.

    .. math::
        sl_{ik} = s_k \|x_i-c_k\|^2

    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` :math:`S\in \mathcal{R}^K` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
        - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
    """
    func = _scaledL2()
    return func(X, C, S)
class _aggregateP(Function):
    """Old-style autograd Function backing :func:`aggregateP`: aggregates
    pre-computed residuals R with weights A entirely in the CUDA extension.
    Only CUDA float/double are supported."""

    def forward(self, A, R):
        # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
        self.save_for_backward(A, R)
        B, N, K, D = R.size()
        with torch.cuda.device_of(A):
            E = A.new(B, K, D)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return E

    def backward(self, gradE):
        A, R = self.saved_tensors
        with torch.cuda.device_of(A):
            gradA = A.new().resize_as_(A)
            gradR = R.new().resize_as_(R)
        # The extension fills both gradA and gradR.
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregate_backward(gradA,
                    gradR, gradE, A, R)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregate_backward(gradA,
                    gradR, gradE, A, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradA, gradR
def aggregateP(A, R):
    """Aggregate pre-computed residuals ``R`` (BxNxKxD) with assignment
    weights ``A`` (BxNxK); returns a BxKxD tensor."""
    func = _aggregateP()
    return func(A, R)
class _residual(Function):
    """Old-style autograd Function backing :func:`residual`: expands
    R[b,i,k,:] = X[b,i,:] - C[k,:] via the CUDA extension.
    Only CUDA float/double are supported."""

    def forward(self, X, C):
        # X \in(BxNxD) D \in(KxD) R \in(BxNxKxD)
        B, N, D = X.size()
        K = C.size(0)
        with torch.cuda.device_of(X):
            R = X.new(B, N, K, D)
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_residual_forward(R, X, C)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_residual_forward(R, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        return R

    def backward(self, gradR):
        # Nothing is saved in forward: the gradient only depends on gradR.
        B, N, K, D = gradR.size()
        with torch.cuda.device_of(gradR):
            gradX = gradR.new(B, N, D)
            gradD = gradR.new(K, D)
        if isinstance(gradR, torch.cuda.FloatTensor):
            with torch.cuda.device_of(gradR):
                encoding_lib.Encoding_Float_residual_backward(gradR,
                    gradX, gradD)
        elif isinstance(gradR, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(gradR):
                encoding_lib.Encoding_Double_residual_backward(gradR,
                    gradX, gradD)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradX, gradD
def residual(X, C):
    r"""
    Calculate residuals over a mini-batch.

    .. math::
        r_{ik} = x_i - c_k

    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
        - Output: :math:`R\in\mathcal{R}^{B\times N\times K\times D}`
    """
    func = _residual()
    return func(X, C)
class _square_squeeze(Function):
    """Old-style autograd Function: reduces residuals R (BxNxKxD) to squared
    norms L (BxNxK) via the CUDA extension. Only CUDA float/double are
    supported."""

    def forward(self, R):
        B, N, K, D = R.size()
        with torch.cuda.device_of(R):
            L = R.new(B, N, K)
        if isinstance(R, torch.cuda.FloatTensor):
            with torch.cuda.device_of(R):
                encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
        elif isinstance(R, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(R):
                encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        self.save_for_backward(L, R)
        return L

    def backward(self, gradL):
        L, R = self.saved_tensors
        B, N, K, D = R.size()
        with torch.cuda.device_of(R):
            gradR = R.new(B, N, K, D)
        if isinstance(R, torch.cuda.FloatTensor):
            with torch.cuda.device_of(gradL):
                encoding_lib.Encoding_Float_squaresqueeze_backward(gradL,
                    gradR, R)
        elif isinstance(R, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(gradL):
                encoding_lib.Encoding_Double_squaresqueeze_backward(gradL,
                    gradR, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradR
def assign(R, S):
    r"""
    Calculate assignment weights for given residuals (:math:`R`) and scale (:math:`S`)
    .. math::
        a_{ik} = \frac{exp(-s_k\|r_{ik}\|^2)}{\sum_{j=1}^K exp(-s_j\|r_{ik}\|^2)}
    Shape:
        - Input: :math:`R\in\mathcal{R}^{B\times N\times K\times D}` :math:`S\in \mathcal{R}^K` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
        - Output :math:`A\in\mathcal{R}^{B\times N\times K}`
    """
    # L[b,i,k] = ||r_{ik}||^2, computed by the CUDA square-squeeze kernel.
    L = _square_squeeze()(R)
    K = S.size(0)
    # NOTE(review): the docstring shows exp(-s_k * L) but the code applies
    # softmax to +S*L, so the negative sign must live in the learned scale S
    # -- confirm against the training code.
    SL = L * S.view(1, 1, K)
    # NOTE(review): F.softmax without dim= relies on legacy implicit-dim
    # behavior (deprecated in later PyTorch) -- confirm the intended axis (K).
    return F.softmax(SL)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
# Public API of this module.
__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
class _sum_square(Function):
    """Old-style autograd Function: per-channel sum and sum-of-squares over
    (B, H, W), used to build batch statistics for batch normalization.
    Only CUDA float/double are supported."""

    def forward(ctx, input):
        ctx.save_for_backward(input)
        B, C, H, W = input.size()
        with torch.cuda.device_of(input):
            xsum = input.new().resize_(C).zero_()
            xsquare = input.new().resize_(C).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_sum_square_Forward(
                    input.view(B, C, -1), xsum, xsquare)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_sum_square_Forward(
                    input.view(B, C, -1), xsum, xsquare)
        else:
            raise RuntimeError('Unimplemented data type!')
        return xsum, xsquare

    def backward(ctx, gradSum, gradSquare):
        input, = ctx.saved_tensors
        B, C, H, W = input.size()
        with torch.cuda.device_of(input):
            gradInput = input.new().resize_(B, C, H * W).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_sum_square_Backward(
                    gradInput, input.view(B, C, -1), gradSum, gradSquare)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_sum_square_Backward(
                    gradInput, input.view(B, C, -1), gradSum, gradSquare)
        else:
            raise RuntimeError('Unimplemented data type!')
        # The kernel works on the flattened (B, C, H*W) view; restore 4D.
        return gradInput.view(B, C, H, W)
def sum_square(input):
    r"""
    Calculate sum of elements and sum of squares for Batch Normalization.
    """
    func = _sum_square()
    return func(input)
class _batchnormtrain(Function):
    """Old-style autograd Function: batch-norm forward/backward in training
    mode (the trailing ``True`` flag distinguishes it from _batchnormeval's
    ``False``). Only CUDA float/double 3D inputs are supported."""

    def forward(ctx, input, gamma, beta, mean, std):
        ctx.save_for_backward(input, gamma, beta, mean, std)
        # The kernel expects the flattened (N, C, L) layout.
        assert(input.dim() == 3)
        with torch.cuda.device_of(input):
            invstd = 1.0 / std
            output = input.new().resize_as_(input)
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        else:
            raise RuntimeError('Unimplemented data type!')
        return output

    def backward(ctx, gradOutput):
        input, gamma, beta, mean, std = ctx.saved_tensors
        invstd = 1.0 / std
        with torch.cuda.device_of(input):
            gradInput = gradOutput.new().resize_as_(input).zero_()
            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
            gradBeta = gradOutput.new().resize_as_(beta).zero_()
            gradMean = gradOutput.new().resize_as_(mean).zero_()
            gradStd = gradOutput.new().resize_as_(std).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    True)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    True)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradInput, gradGamma, gradBeta, gradMean, gradStd
def batchnormtrain(input, gamma, beta, mean, std):
    r"""Applies Batch Normalization over a 3d input that is seen as a
    mini-batch (training mode).

    .. _encoding.batchnormtrain:

    .. math::
        y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
    """
    func = _batchnormtrain()
    return func(input, gamma, beta, mean, std)
class _batchnormeval(Function):
    """Old-style autograd Function: batch-norm forward/backward in evaluation
    mode (identical to _batchnormtrain except the trailing ``False`` flag in
    the backward extension call). Only CUDA float/double 3D inputs are
    supported."""

    def forward(ctx, input, gamma, beta, mean, std):
        ctx.save_for_backward(input, gamma, beta, mean, std)
        # The kernel expects the flattened (N, C, L) layout.
        assert(input.dim() == 3)
        with torch.cuda.device_of(input):
            invstd = 1.0 / std
            output = input.new().resize_as_(input)
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        else:
            raise RuntimeError('Unimplemented data type!')
        return output

    def backward(ctx, gradOutput):
        input, gamma, beta, mean, std = ctx.saved_tensors
        invstd = 1.0 / std
        with torch.cuda.device_of(input):
            gradInput = gradOutput.new().resize_as_(input).zero_()
            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
            gradBeta = gradOutput.new().resize_as_(beta).zero_()
            gradMean = gradOutput.new().resize_as_(mean).zero_()
            gradStd = gradOutput.new().resize_as_(std).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    False)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    False)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradInput, gradGamma, gradBeta, gradMean, gradStd
def batchnormeval(input, gamma, beta, mean, std):
    r"""Applies Batch Normalization over a 3d input that is seen as a
    mini-batch.

    Evaluation-mode counterpart of :func:`batchnormtrain`; the supplied
    ``mean``/``std`` are used as fixed statistics.

    Please see encoding.batchnormtrain_
    """
    return _batchnormeval()(input, gamma, beta, mean, std)
...@@ -31,8 +31,8 @@ __global__ void Encoding_(AggregateE_Forward_kernel) ( ...@@ -31,8 +31,8 @@ __global__ void Encoding_(AggregateE_Forward_kernel) (
k = blockIdx.y * blockDim.y + threadIdx.y; k = blockIdx.y * blockDim.y + threadIdx.y;
N = A.getSize(1); N = A.getSize(1);
/* boundary check for output */ /* boundary check for output */
sum = 0;
if (d >= E.getSize(2) || k >= E.getSize(1)) return; if (d >= E.getSize(2) || k >= E.getSize(1)) return;
sum = 0;
/* main operation */ /* main operation */
for(i=0; i<N; i++) { for(i=0; i<N; i++) {
sum += A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg()); sum += A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
...@@ -49,9 +49,9 @@ void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_, ...@@ -49,9 +49,9 @@ void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
/* Check the GPU index and tensor dims*/ /* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_); THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 || if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 || THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 || THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2) THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n"); THError("Encoding: incorrect input dims. \n");
/* Device tensors */ /* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_); THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
...@@ -62,7 +62,7 @@ void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_, ...@@ -62,7 +62,7 @@ void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
cudaStream_t stream = THCState_getCurrentStream(state); cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16); dim3 threads(16, 16);
dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1, dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1,
E.getSize(0)); E.getSize(0));
Encoding_(AggregateE_Forward_kernel)<<<blocks, threads, 0, stream>>> Encoding_(AggregateE_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C); (E, A, X, C);
THCudaCheck(cudaGetLastError()); THCudaCheck(cudaGetLastError());
...@@ -527,491 +527,5 @@ void Encoding_(ScaledL2_Backward)( ...@@ -527,491 +527,5 @@ void Encoding_(ScaledL2_Backward)(
(GSL, GC, X, C, S); (GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError()); THCudaCheck(cudaGetLastError());
} }
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Forward_kernel) (
    THCDeviceTensor<real, 3> L,
    THCDeviceTensor<real, 4> R)
/*
 * Forward kernel: squeeze the last dimension of R by squared-L2 norm,
 *     L[b][i][k] = sum_d R[b][i][k][d]^2
 */
{
    /* one thread per (k, i) output element; grid z enumerates the batch */
    const int batch    = blockIdx.z;
    const int codeword = blockIdx.x * blockDim.x + threadIdx.x;
    const int sample   = blockIdx.y * blockDim.y + threadIdx.y;
    /* threads outside the output tile do nothing */
    if (codeword >= L.getSize(2) || sample >= L.getSize(1)) return;
    const int dims = R.getSize(3);
    real acc = 0;
    for (int d = 0; d < dims; d++) {
        real r = R[batch][sample][codeword][d].ldg();
        acc += r * r;
    }
    L[batch][sample][codeword] = acc;
}
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_)
/*
 * Host launcher: L[b][i][k] = sum_d R[b][i][k][d]^2
 * (squeezes the last dimension of R by its squared L2 norm).
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, L_, R_);
if (THCTensor_(nDimension)(state, L_) != 3 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function: 16x16 thread tiles over (k, i), batch on grid z */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(L.getSize(2)/16+1, L.getSize(1)/16+1,
L.getSize(0));
Encoding_(SquareSqueeze_Forward_kernel)<<<blocks, threads, 0, stream>>>
(L, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Backward_kernel) (
    THCDeviceTensor<real, 3> GL,
    THCDeviceTensor<real, 4> GR,
    THCDeviceTensor<real, 4> R)
/*
 * Backward kernel of the squared-L2 squeeze:
 *     GR[b][i][k][d] = 2 * GL[b][i][k] * R[b][i][k][d]
 * (d/dr of sum r^2 is 2r, scaled by the incoming gradient).
 */
{
    const int batch    = blockIdx.z;
    const int codeword = blockIdx.x * blockDim.x + threadIdx.x;
    const int sample   = blockIdx.y * blockDim.y + threadIdx.y;
    /* threads outside the tensor do nothing */
    if (codeword >= R.getSize(2) || sample >= R.getSize(1)) return;
    /* per-element chain-rule factor */
    const real factor = GL[batch][sample][codeword] * 2;
    const int dims = R.getSize(3);
    for (int d = 0; d < dims; d++) {
        GR[batch][sample][codeword][d] = factor * R[batch][sample][codeword][d];
    }
}
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_)
/*
 * Host launcher for the squared-L2 squeeze backward pass:
 * GR = 2 * GL * R (broadcast over the last dimension of R).
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, GL_, GR_, R_);
if (THCTensor_(nDimension)(state, GL_) != 3 ||
THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GL = devicetensor<3>(state, GL_);
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function: 16x16 thread tiles over (k, i), batch on grid z */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(R.getSize(2)/16+1, R.getSize(1)/16+1,
R.getSize(0));
Encoding_(SquareSqueeze_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GL, GR, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(BatchNorm_Forward_kernel) (
    THCDeviceTensor<real, 3> output,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> mean,
    THCDeviceTensor<real, 1> invstd,
    THCDeviceTensor<real, 1> gamma,
    THCDeviceTensor<real, 1> beta)
/*
 * Normalize one channel per block:
 *     y = gamma * (x - mean) * invstd + beta
 * Threads stride over the flattened spatial dimension.
 */
{
    const int c = blockIdx.x;
    /* per-channel parameters are constant for the whole block */
    const real mu = mean[c].ldg();
    const real is = invstd[c].ldg();
    const real g  = gamma[c].ldg();
    const real bt = beta[c].ldg();
    const int nbatch   = input.getSize(0);
    const int nspatial = input.getSize(2);
    for (int n = 0; n < nbatch; ++n) {
        for (int s = threadIdx.x; s < nspatial; s += blockDim.x) {
            /* keep the original multiply order: g * (x - mu) * invstd + beta */
            output[n][c][s] = g * (input[n][c][s].ldg() - mu) * is + bt;
        }
    }
}
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
 * batch norm forward launcher
 * assuming the input is already flattened to (batch, channel, spatial);
 * one CUDA block per channel.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
gamma_, beta_);
if (THCTensor_(nDimension)(state, output_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1)
THError("BatchNorm2d forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
/* kernel function: one block per channel, threads over spatial dim */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
output, input, mean, invstd, gamma, beta);
THCudaCheck(cudaGetLastError());
}
/* Pair of accumulators that are reduced together in one pass
 * (e.g. sum and sum-of-products for the batch-norm backward). */
struct Encoding_(Float2){
real v1, v2;
__device__ Encoding_(Float2)() {}
__device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
__device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
/* element-wise accumulation, used by the block reductions below */
__device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a) {
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
/* Sum `val` across the lanes of the calling warp; every lane
 * receives the warp-wide total. */
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if __CUDA_ARCH__ >= 300
/* Kepler+: butterfly reduction with XOR warp shuffles */
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
/* fallback: stage lane values in shared memory, then each lane sums
 * its own warp's slots (rotated start to spread bank traffic) */
__shared__ real values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
return val;
}
/* Warp-sum both components of a Float2 pair. */
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(Encoding_(Float2) value) {
value.v1 = Encoding_(rwarpSum)(value.v1);
value.v2 = Encoding_(rwarpSum)(value.v2);
return value;
}
/* Pointwise functor for the batch-norm backward reduction:
 * yields (gradOutput, gradOutput * (input - mean)) per element, so one
 * reduce pass produces both the gradOutput sum and the dot product. */
struct Encoding_(GradOp) {
__device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = gradOutput[batch][plane][n].ldg();
real c = input[batch][plane][n].ldg() - mean;
return Encoding_(Float2)(g, g * c);
}
real mean;
THCDeviceTensor<real, 3> input;
THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, x/y/z) applying Op() pointwise.
// Two-stage block reduction: each thread accumulates a strided slice,
// warps reduce via warpSum, warp leaders stage partials in shared memory,
// and warp 0 reduces those partials. All threads return the block total.
__device__ Encoding_(Float2) Encoding_(reduce)(Encoding_(GradOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
// warp leaders deposit their warp's partial sum
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
/* Batch-norm backward, one channel per block.
 * Computes gradInput = gradOutput * invstd * gamma and, when train != 0,
 * the gradients w.r.t. the externally supplied mean/std statistics.
 * NOTE: gradgamma/gradbeta are accumulated with `+=`, so callers are
 * expected to pass zero-initialized buffers (the Python wrapper does).
 * `beta` is accepted but unused here. */
__global__ void Encoding_(BatchNorm_Backward_kernel) (
THCDeviceTensor<real, 3> gradoutput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 3> gradinput,
THCDeviceTensor<real, 1> gradgamma,
THCDeviceTensor<real, 1> gradbeta,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta,
THCDeviceTensor<real, 1> gradMean,
THCDeviceTensor<real, 1> gradStd,
int train)
{
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation: one block-wide reduction gives both the gradOutput
 * sum (res.v1) and sum(gradOutput * (input - mean)) (res.v2) */
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
real dotP = res.v2;
//real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
real gradScale = invstd[c].ldg() * gamma[c].ldg();
if (train && threadIdx.x == 0) {
/* gradients w.r.t. the (external) statistics; thread 0 only */
gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
/* d y / d gamma = (x - mean) * invstd, reduced above into dotP */
gradgamma[c] += dotP * invstd[c].ldg();
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
 * batch norm backward launcher
 * assuming the input is already flattened to (batch, channel, spatial);
 * one CUDA block per channel.
 * BUGFIX: checkGPU was previously called with count 6 although 9 tensors
 * were passed, so the trailing tensors were never device-checked, and
 * gradMean_/gradStd_ were missing from the list entirely. All 11 tensors
 * are now verified.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 11, gradoutput_, input_, gradinput_,
gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_,
gradMean_, gradStd_);
if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradinput_) != 3 ||
THCTensor_(nDimension)(state, gradgamma_) != 1 ||
THCTensor_(nDimension)(state, gradbeta_) != 1 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1 ||
THCTensor_(nDimension)(state, gradMean_) != 1 ||
THCTensor_(nDimension)(state, gradStd_) != 1 )
THError("BatchNorm2d backward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradoutput =
devicetensor<3>(state, gradoutput_);
THCDeviceTensor<real, 3> input =
devicetensor<3>(state, input_);
THCDeviceTensor<real, 3> gradinput =
devicetensor<3>(state, gradinput_);
THCDeviceTensor<real, 1> gradgamma =
devicetensor<1>(state, gradgamma_);
THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
/* kernel function: one block per channel, threads over spatial dim */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
gamma, beta, gradMean, gradStd, train);
THCudaCheck(cudaGetLastError());
}
/* Pointwise functor for the sum/sum-of-squares reduction:
 * yields (x, x*x) per element. The `mean` member is unused here; it is
 * kept only to mirror GradOp's layout. */
struct Encoding_(SumOp) {
__device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
: input(i){}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = input[batch][plane][n].ldg();
return Encoding_(Float2)(g, g * g);
}
real mean;
THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise.
// Same two-stage block reduction as Encoding_(reduce), specialized to
// SumOp: thread-strided accumulation, warp shuffle sums, shared-memory
// stage, final warp-0 reduction broadcast to every thread.
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
// warp leaders deposit their warp's partial sum
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
/* Per-channel sum and sum-of-squares (used to derive batch statistics);
 * one block per channel, thread 0 writes the reduced results. */
__global__ void Encoding_(Sum_Square_Forward_kernel) (
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> sum,
THCDeviceTensor<real, 1> square)
{
int c = blockIdx.x;
/* main operation: one reduction yields both accumulators */
Encoding_(SumOp) g(input);
Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
real xsum = res.v1;
real xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
 * Host launcher: per-channel sum and sum-of-squares of a flattened
 * (batch, channel, spatial) input.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
if (THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, sum_) != 1 ||
THCTensor_(nDimension)(state, square_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
/* kernel function: one block per channel */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
input, sum, square);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(Sum_Square_Backward_kernel) (
    THCDeviceTensor<real, 3> gradInput,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> gradSum,
    THCDeviceTensor<real, 1> gradSquare)
/*
 * Backward of the per-channel sum / sum-of-squares:
 *     d(sum x)/dx = 1, d(sum x^2)/dx = 2x, hence
 *     gradInput = gradSum + 2 * gradSquare * input.
 * One block per channel; threads stride over the spatial dimension.
 */
{
    const int c = blockIdx.x;
    const int nbatch   = gradInput.getSize(0);
    const int nspatial = gradInput.getSize(2);
    for (int n = 0; n < nbatch; ++n) {
        for (int s = threadIdx.x; s < nspatial; s += blockDim.x) {
            gradInput[n][c][s] = gradSum[c] + 2 * gradSquare[c] *
                input[n][c][s];
        }
    }
}
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput_, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * Host launcher for the sum / sum-of-squares backward pass:
 * gradInput = gradSum + 2 * gradSquare * input (per channel).
 * BUGFIX: the dimension-check error message previously said
 * "Sum_Square forward", copied from the forward launcher.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
gradSquare_);
if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradSum_) != 1 ||
THCTensor_(nDimension)(state, gradSquare_) != 1)
THError("Sum_Square backward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
THCDeviceTensor<real, 1> gradSquare =devicetensor<1>(state, gradSquare_);
/* kernel function: one block per channel */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradInput, input, gradSum, gradSquare);
THCudaCheck(cudaGetLastError());
}
#endif #endif
...@@ -38,28 +38,4 @@ void Encoding_(Residual_Forward)( ...@@ -38,28 +38,4 @@ void Encoding_(Residual_Forward)(
void Encoding_(Residual_Backward)( void Encoding_(Residual_Backward)(
THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_); THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_);
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_);
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
#endif #endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.c"
#else
__global__ void Encoding_(DilatedAvgPool_Forward_kernel) (
    THCDeviceTensor<real, 4> X,
    THCDeviceTensor<real, 4> Y,
    int kH, int kW, int dH, int dW,
    int padH, int padW, int dilationH, int dilationW
)
/*
 * Dilated avgpool2d forward kernel.
 * One thread per output pixel (h, w); blockIdx.z enumerates batch*channel.
 * BUGFIX: the vertical window start was computed with the horizontal
 * stride (h*dW - padH); it must use the vertical stride dH. The old code
 * was only correct when dH == dW.
 */
{
    /* declarations of the variables */
    int bc, b, c, w, h, C;
    real sum;
    /* Get the index and channels */
    bc = blockIdx.z;
    w = blockIdx.x * blockDim.x + threadIdx.x;
    h = blockIdx.y * blockDim.y + threadIdx.y;
    C = Y.getSize(1);
    b = bc / C;
    c = bc - b*C;
    /* boundary check for output */
    if (w >= Y.getSize(3) || h >= Y.getSize(2)) return;
    /* pooling window in input coordinates (clipped to the image) */
    int hstart = h*dH - padH;   /* was h*dW - padH */
    int wstart = w*dW - padW;
    int hend = min(hstart + kH*dilationH, X.getSize(2));
    int wend = min(wstart + kW*dilationW, X.getSize(3));
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    /* number of dilated taps that fall inside the clipped window */
    int pool_size = ((hend - hstart - 1) / dilationH + 1) *
        ((wend - wstart - 1) / dilationW + 1);
    sum = 0;
    for (int th = hstart; th < hend; th += dilationH) {
        for (int tw = wstart; tw < wend; tw += dilationW) {
            sum += X[b][c][th][tw];
        }
    }
    Y[b][c][h][w] = sum / pool_size;
}
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
 * dilated avgpool2d forward launcher
 * X: (B, C, H, W) input, Y: (B, C, outH, outW) output.
 * kH/kW kernel, dH/dW stride, padH/padW padding, dilationH/dilationW
 * spacing between taps.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, X_, Y_);
if (THCTensor_(nDimension)(state, X_) != 4 ||
THCTensor_(nDimension)(state, Y_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> X = devicetensor<4>(state, X_);
THCDeviceTensor<real, 4> Y = devicetensor<4>(state, Y_);
/* kernel function: 16x16 tiles over output pixels, batch*channel on z */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(Y.getSize(3)/16+1, Y.getSize(2)/16+1,
Y.getSize(1)*Y.getSize(0));
Encoding_(DilatedAvgPool_Forward_kernel)<<<blocks, threads, 0, stream>>>
(X, Y, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(DilatedAvgPool_Backward_kernel) (
    THCDeviceTensor<real, 4> gradX,
    THCDeviceTensor<real, 4> gradY,
    int kH, int kW, int dH, int dW,
    int padH, int padW, int dilationH, int dilationW
)
/*
 * Dilated avgpool2d backward kernel.
 * One thread per input pixel (h, w): iterate the output windows that
 * could contain it and accumulate gradY / pool_size for each.
 * NOTE: the inner test does not re-check that (h, w) is an actual dilated
 * tap of the window; this matches the forward indexing once strides are
 * consistent.
 * BUGFIX: the window's vertical start was computed with the horizontal
 * stride (ph*dW - padH); it must use the vertical stride dH, mirroring
 * the (fixed) forward kernel. The old code was only correct when dH == dW.
 */
{
    /* declarations of the variables */
    int bc, b, c, w, h, C;
    real sum;
    /* Get the index and channels */
    bc = blockIdx.z;
    w = blockIdx.x * blockDim.x + threadIdx.x;
    h = blockIdx.y * blockDim.y + threadIdx.y;
    C = gradX.getSize(1);
    b = bc / C;
    c = bc - b*C;
    /* boundary check for output */
    if (w >= gradX.getSize(3) || h >= gradX.getSize(2)) return;
    /* range of output windows whose receptive field covers (h, w) */
    int phstart = (h + padH < ((kH-1)*dilationH+1)) ? 0 :
        (h + padH - ((kH-1)*dilationH+1))/dH + 1;
    int pwstart = (w + padW < ((kW-1)*dilationW+1)) ? 0 :
        (w + padW - ((kW-1)*dilationW+1))/dW + 1;
    int phend = min((h+padH)/dH+1, gradY.getSize(2));
    int pwend = min((w+padW)/dW+1, gradY.getSize(3));
    sum = 0;
    int hstart, wstart, hend, wend, pool_size;
    for (int ph=phstart; ph < phend; ++ph) {
        for (int pw=pwstart; pw < pwend; ++pw) {
            /* recompute each window's clipped extent / divisor, exactly
             * as the forward pass does */
            hstart = ph*dH - padH;   /* was ph*dW - padH */
            wstart = pw*dW - padW;
            hend = min(hstart + kH*dilationH, gradX.getSize(2));
            wend = min(wstart + kW*dilationW, gradX.getSize(3));
            hstart = max(hstart, 0);
            wstart = max(wstart, 0);
            pool_size = ((hend - hstart - 1) / dilationH + 1) *
                ((wend - wstart - 1) / dilationW + 1);
            sum += gradY[b][c][ph][pw] / pool_size;
        }
    }
    gradX[b][c][h][w] = sum;
}
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
 * dilated avgpool2d backward launcher:
 * scatter gradY back to gradX with the same window geometry as forward.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, gradX_, gradY_);
if (THCTensor_(nDimension)(state, gradX_) != 4 ||
THCTensor_(nDimension)(state, gradY_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> gradX = devicetensor<4>(state, gradX_);
THCDeviceTensor<real, 4> gradY = devicetensor<4>(state, gradY_);
/* kernel function: 16x16 tiles over input pixels, batch*channel on z */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(gradX.getSize(3)/16+1, gradX.getSize(2)/16+1,
gradX.getSize(1)*gradX.getSize(0));
Encoding_(DilatedAvgPool_Backward_kernel)<<<blocks, threads, 0, stream>>>
(gradX, gradY, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.h"
#else
/* dilated avgpool2d forward: Y = avgpool(X) with dilated taps */
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
/* dilated avgpool2d backward: gradX accumulated from gradY */
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.c"
#else
/* Forward kernel: L[b][i][k] = sum_d R[b][i][k][d]^2
 * (squared-L2 squeeze of R's last dimension). */
__global__ void Encoding_(SquareSqueeze_Forward_kernel) (
THCDeviceTensor<real, 3> L,
THCDeviceTensor<real, 4> R)
{
/* declarations of the variables */
int b, k, d, i, D;
real sum;
/* Get the index and channels; grid z is the batch */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = R.getSize(3);
/* boundary check for output */
if (k >= L.getSize(2) || i >= L.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
sum += R[b][i][k][d].ldg()*R[b][i][k][d].ldg();
}
L[b][i][k] = sum;
}
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_)
/*
 * Host launcher: L[b][i][k] = sum_d R[b][i][k][d]^2.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, L_, R_);
if (THCTensor_(nDimension)(state, L_) != 3 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function: 16x16 tiles over (k, i), batch on grid z */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(L.getSize(2)/16+1, L.getSize(1)/16+1,
L.getSize(0));
Encoding_(SquareSqueeze_Forward_kernel)<<<blocks, threads, 0, stream>>>
(L, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* Backward kernel: GR[b][i][k][d] = 2 * GL[b][i][k] * R[b][i][k][d]. */
__global__ void Encoding_(SquareSqueeze_Backward_kernel) (
THCDeviceTensor<real, 3> GL,
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 4> R)
{
/* declarations of the variables */
int b, k, d, i, D;
real scale;
/* Get the index and channels; grid z is the batch */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = R.getSize(3);
/* boundary check for output */
if (k >= R.getSize(2) || i >= R.getSize(1)) return;
/* main operation: chain rule through sum of squares (d/dr r^2 = 2r) */
scale = GL[b][i][k] * 2;
for(d=0; d<D; d++) {
GR[b][i][k][d] = scale * R[b][i][k][d];
}
}
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_)
/*
 * Host launcher for the squared-L2 squeeze backward pass: GR = 2*GL*R.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, GL_, GR_, R_);
if (THCTensor_(nDimension)(state, GL_) != 3 ||
THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GL = devicetensor<3>(state, GL_);
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function: 16x16 tiles over (k, i), batch on grid z */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(R.getSize(2)/16+1, R.getSize(1)/16+1,
R.getSize(0));
Encoding_(SquareSqueeze_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GL, GR, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* Batch-norm forward, one channel per block:
 * y = gamma * (x - mean) * invstd + beta; threads stride the spatial dim. */
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
real inp = input[b][c][x].ldg();
output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
invstd[c].ldg() + beta[c].ldg();
}
}
}
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
 * batch norm forward launcher
 * assuming the input is already flattened to (batch, channel, spatial)
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
gamma_, beta_);
if (THCTensor_(nDimension)(state, output_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1)
THError("BatchNorm2d forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
/* kernel function: one block per channel */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
output, input, mean, invstd, gamma, beta);
THCudaCheck(cudaGetLastError());
}
/* Pair of accumulators reduced together in one pass. */
struct Encoding_(Float2){
real v1, v2;
__device__ Encoding_(Float2)() {}
__device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
__device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
/* element-wise accumulation */
__device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a) {
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
/* Sum `val` across the calling warp; every lane gets the total. */
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if __CUDA_ARCH__ >= 300
/* Kepler+: butterfly reduction with XOR warp shuffles */
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
/* fallback via shared memory */
__shared__ real values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
return val;
}
/* Warp-sum both components of a Float2 pair. */
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(Encoding_(Float2) value) {
value.v1 = Encoding_(rwarpSum)(value.v1);
value.v2 = Encoding_(rwarpSum)(value.v2);
return value;
}
/* Pointwise functor for the batch-norm backward reduction:
 * yields (gradOutput, gradOutput * (input - mean)) per element. */
struct Encoding_(GradOp) {
__device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = gradOutput[batch][plane][n].ldg();
real c = input[batch][plane][n].ldg() - mean;
return Encoding_(Float2)(g, g * c);
}
real mean;
THCDeviceTensor<real, 3> input;
THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, x/y/z) applying Op() pointwise
/* Block-wide reduction for one `plane` (channel):
 * 1) each thread accumulates a blockDim.x-strided slice over all batches,
 * 2) each warp reduces its lanes' partials,
 * 3) warp leaders stage their sums in shared memory,
 * 4) warp 0 reduces the staged partials and broadcasts via shared[0].
 * Every thread returns the same Float2 total. */
__device__ Encoding_(Float2) Encoding_(reduce)(Encoding_(GradOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
    Encoding_(Float2) sum = (Encoding_(Float2))0;
    for (int batch = 0; batch < tensor.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
            sum += op(batch, plane, x);
        }
    }
    // sum over NumThreads within a warp
    sum = Encoding_(warpSum)(sum);
    // 'transpose', and reduce within warp again
    __shared__ Encoding_(Float2) shared[32];
    __syncthreads();
    if (threadIdx.x % WARP_SIZE == 0) {
        if (threadIdx.x / WARP_SIZE < 32) {
            shared[threadIdx.x / WARP_SIZE] = sum;
        }
    }
    /* NOTE(review): this zero-fill runs with no barrier separating it from
     * the leader writes above; the two index sets are disjoint only when
     * blockDim.x is a multiple of WARP_SIZE -- confirm getNumThreads()
     * guarantees that. */
    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
        // zero out the other entries in shared
        shared[threadIdx.x] = (Encoding_(Float2))0;
    }
    __syncthreads();
    if (threadIdx.x / WARP_SIZE == 0) {
        sum = Encoding_(warpSum)(shared[threadIdx.x]);
        if (threadIdx.x == 0) {
            shared[0] = sum;
        }
    }
    __syncthreads();
    // Everyone picks it up, should be broadcast into the whole gradInput
    return shared[0];
}
/* Backward pass of (sync) batch norm; one block per channel
 * (c = blockIdx.x) over input flattened to B x C x N.
 *
 * A single fused reduction yields
 *   gradOutputSum = sum_i dL/dy_i
 *   dotP          = sum_i dL/dy_i * (x_i - mean_c)
 *
 * gradinput carries only the direct term dL/dy * gamma * invstd; the
 * contributions through the batch statistics are written separately to
 * gradMean / gradStd when train != 0 (NOTE(review): presumably
 * recombined by the calling Python autograd function -- confirm).
 * gradgamma / gradbeta are accumulated with +=, not overwritten. */
__global__ void Encoding_(BatchNorm_Backward_kernel) (
    THCDeviceTensor<real, 3> gradoutput,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 3> gradinput,
    THCDeviceTensor<real, 1> gradgamma,
    THCDeviceTensor<real, 1> gradbeta,
    THCDeviceTensor<real, 1> mean,
    THCDeviceTensor<real, 1> invstd,
    THCDeviceTensor<real, 1> gamma,
    THCDeviceTensor<real, 1> beta,
    THCDeviceTensor<real, 1> gradMean,
    THCDeviceTensor<real, 1> gradStd,
    int train)
{
    /* declarations of the variables */
    /* Get the index and channels */
    int c = blockIdx.x;
    /* main operation */
    Encoding_(GradOp) g(mean[c], input, gradoutput);
    /* block-wide fused reduction over batch and spatial dims */
    Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
    real gradOutputSum = res.v1;
    real dotP = res.v2;
    //real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
    real gradScale = invstd[c].ldg() * gamma[c].ldg();
    if (train && threadIdx.x == 0) {
        /* gradients w.r.t. the batch statistics (written by thread 0 only) */
        gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
        gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
    }
    if (gradinput.numElements() > 0) {
        for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
            for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
                gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
            }
        }
    }
    if (gradgamma.numElements() > 0) {
        if (threadIdx.x == 0) {
            /* accumulated, so the caller is expected to zero-initialize
             * gradgamma (NOTE(review): confirm at the call site) */
            gradgamma[c] += dotP * invstd[c].ldg();
        }
    }
    if (gradbeta.numElements() > 0) {
        if (threadIdx.x == 0) {
            gradbeta[c] += gradOutputSum;
        }
    }
}
void Encoding_(BatchNorm_Backward)(THCState *state,
    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
    THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
 * Batch norm backward host entry point.
 * Assumes the input is already flattened to B x C x N.
 * Launches one block per channel; threads stride the last dimension.
 */
{
    /* Check the GPU index and tensor dims.
     * Fix: the original call declared 6 tensors but passed 9 -- the count
     * argument of checkGPU must match the number of tensors that follow --
     * and the two statistics-gradient tensors were missing entirely. */
    THCTensor_(checkGPU)(state, 11, gradoutput_, input_, gradinput_,
        gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_,
        gradMean_, gradStd_);
    if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, gradinput_) != 3 ||
        THCTensor_(nDimension)(state, gradgamma_) != 1 ||
        THCTensor_(nDimension)(state, gradbeta_) != 1 ||
        THCTensor_(nDimension)(state, mean_) != 1 ||
        THCTensor_(nDimension)(state, invstd_) != 1 ||
        THCTensor_(nDimension)(state, gamma_) != 1 ||
        THCTensor_(nDimension)(state, beta_) != 1 ||
        THCTensor_(nDimension)(state, gradMean_) != 1 ||
        THCTensor_(nDimension)(state, gradStd_) != 1 )
        THError("BatchNorm2d backward: incorrect input dims. \n");
    /* Wrap as device tensors */
    THCDeviceTensor<real, 3> gradoutput =
        devicetensor<3>(state, gradoutput_);
    THCDeviceTensor<real, 3> input =
        devicetensor<3>(state, input_);
    THCDeviceTensor<real, 3> gradinput =
        devicetensor<3>(state, gradinput_);
    THCDeviceTensor<real, 1> gradgamma =
        devicetensor<1>(state, gradgamma_);
    THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
    THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
    THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
    THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
    THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
    THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
    THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
    /* Launch: one block per channel, threads stride the spatial dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
        gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
        gamma, beta, gradMean, gradStd, train);
    THCudaCheck(cudaGetLastError());
}
/* Pointwise op for Encoding_(reduce_sum): for one element it yields
 * (x, x*x), so a single reduction pass produces both the per-plane sum
 * and sum of squares.
 * Fix: removed the unused, never-initialized `real mean;` member that was
 * copied over from Encoding_(GradOp). */
struct Encoding_(SumOp) {
    __device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
        : input(i){}
    __device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
        real g = input[batch][plane][n].ldg();
        return Encoding_(Float2)(g, g * g);
    }
    THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
/* Block-wide reduction for one `plane`, identical in structure to
 * Encoding_(reduce) above but specialized to Encoding_(SumOp).
 * NOTE(review): this duplicates Encoding_(reduce) verbatim except for the
 * op type -- a template parameter would remove the duplication. */
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
    Encoding_(Float2) sum = (Encoding_(Float2))0;
    /* each thread accumulates a blockDim.x-strided slice over all batches */
    for (int batch = 0; batch < tensor.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
            sum += op(batch, plane, x);
        }
    }
    // sum over NumThreads within a warp
    sum = Encoding_(warpSum)(sum);
    // 'transpose', and reduce within warp again
    __shared__ Encoding_(Float2) shared[32];
    __syncthreads();
    if (threadIdx.x % WARP_SIZE == 0) {
        if (threadIdx.x / WARP_SIZE < 32) {
            shared[threadIdx.x / WARP_SIZE] = sum;
        }
    }
    /* NOTE(review): assumes blockDim.x is a multiple of WARP_SIZE so this
     * zero-fill never races with the leader writes above -- confirm. */
    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
        // zero out the other entries in shared
        shared[threadIdx.x] = (Encoding_(Float2))0;
    }
    __syncthreads();
    if (threadIdx.x / WARP_SIZE == 0) {
        sum = Encoding_(warpSum)(shared[threadIdx.x]);
        if (threadIdx.x == 0) {
            shared[0] = sum;
        }
    }
    __syncthreads();
    // Everyone picks it up, should be broadcast into the whole gradInput
    return shared[0];
}
/* For channel c = blockIdx.x (one block per channel), compute sum(x) and
 * sum(x^2) over the batch and spatial dimensions in a single fused pass,
 * then have thread 0 write both per-channel results. */
__global__ void Encoding_(Sum_Square_Forward_kernel) (
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> sum,
    THCDeviceTensor<real, 1> square)
{
    int c = blockIdx.x;
    /* main operation */
    Encoding_(SumOp) g(input);
    Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
    real xsum = res.v1;
    real xsquare = res.v2;
    if (threadIdx.x == 0) {
        sum[c] = xsum;
        square[c] = xsquare;
    }
}
void Encoding_(Sum_Square_Forward)(THCState *state,
    THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
 * Host entry point: per-channel sum(x) and sum(x^2) of a B x C x N input.
 * NOTE(review): presumably used to assemble cross-device batch statistics
 * for sync batch norm -- confirm against the Python caller.
 */
{
    /* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
    if (THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, sum_) != 1 ||
        THCTensor_(nDimension)(state, square_) != 1)
        THError("Sum_Square forward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
    THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
    /* kernel function: one block per channel; threads stride the last dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
        input, sum, square);
    THCudaCheck(cudaGetLastError());
}
/* Backward of the fused sum / sum-of-squares reduction for channel
 * c = blockIdx.x:
 *   d(sum)/dx = 1 and d(square)/dx = 2x, hence
 *   gradInput = gradSum + 2 * gradSquare * x. */
__global__ void Encoding_(Sum_Square_Backward_kernel) (
    THCDeviceTensor<real, 3> gradInput,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> gradSum,
    THCDeviceTensor<real, 1> gradSquare)
{
    int c = blockIdx.x;
    /* hoist the per-channel loads out of the loops: they are invariant
     * across the whole batch/spatial sweep (was reloaded every element) */
    real gSum = gradSum[c];
    real gSquare2 = 2 * gradSquare[c];
    /* main operation */
    for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
        {
            gradInput[batch][c][x] = gSum + gSquare2 * input[batch][c][x];
        }
    }
}
void Encoding_(Sum_Square_Backward)(THCState *state,
    THCTensor *gradInput_, THCTensor *input_,
    THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * Host entry point for the Sum_Square backward kernel: distributes the
 * gradients of the per-channel sum and sum-of-squares back to the
 * B x C x N input.
 */
{
    /* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
        gradSquare_);
    if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, gradSum_) != 1 ||
        THCTensor_(nDimension)(state, gradSquare_) != 1)
        /* fix: the message said "forward" in the backward function */
        THError("Sum_Square backward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
    THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
    THCDeviceTensor<real, 1> gradSquare = devicetensor<1>(state, gradSquare_);
    /* kernel function: one block per channel; threads stride the last dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
        gradInput, input, gradSum, gradSquare);
    THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
#else
/* Host-side entry points for the synchronized batch-norm CUDA kernels.
 * Instantiated once per real type via the THCGenerate*Type.h includes. */
void Encoding_(SquareSqueeze_Forward)(
    THCState *state, THCTensor *L_, THCTensor *R_);
void Encoding_(SquareSqueeze_Backward)(
    THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
/* Normalize a flattened B x C x N input given precomputed per-channel
 * mean/invstd and affine gamma/beta. */
void Encoding_(BatchNorm_Forward)(THCState *state,
    THCTensor *output_, THCTensor *input_,
    THCTensor *mean_, THCTensor *invstd_,
    THCTensor *gamma_, THCTensor *beta_);
/* Backward pass; gradMean_/gradStd_ receive the statistics gradients
 * when train != 0, gradgamma_/gradbeta_ are accumulated. */
void Encoding_(BatchNorm_Backward)(THCState *state,
    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
    THCTensor *gradMean_, THCTensor *gradStd_, int train);
/* Per-channel sum(x) and sum(x^2) of a B x C x N input, and its backward. */
void Encoding_(Sum_Square_Forward)(THCState *state,
    THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
    THCTensor *gradInput, THCTensor *input_,
    THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
...@@ -21,12 +21,26 @@ ...@@ -21,12 +21,26 @@
extern "C" { extern "C" {
#endif #endif
// float
#include "generic/encoding_kernel.c" #include "generic/encoding_kernel.c"
#include "THC/THCGenerateFloatType.h" #include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_kernel.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_kernel.c"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_kernel.c" #include "generic/encoding_kernel.c"
#include "THC/THCGenerateDoubleType.h" #include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
...@@ -23,12 +23,26 @@ extern THCState *state; ...@@ -23,12 +23,26 @@ extern THCState *state;
extern "C" { extern "C" {
#endif #endif
// float
#include "generic/encoding_kernel.h" #include "generic/encoding_kernel.h"
#include "THC/THCGenerateFloatType.h" #include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_kernel.h"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_kernel.h"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_kernel.h" #include "generic/encoding_kernel.h"
#include "THC/THCGenerateDoubleType.h" #include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
from .syncbn import *
from .basic import *
from .encoding import *
from .customize import *
This diff is collapsed.
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math

import torch
from torch.autograd import Variable
from torch.nn import Conv2d, Module, ReLU, Sequential
from torch.nn import functional as F
from torch.nn.modules.utils import _pair
from torch.nn.parameter import Parameter

from ..functions import dilatedavgpool2d
from ..parallel import my_data_parallel
from .syncbn import BatchNorm2d
__all__ = ['DilatedAvgPool2d', 'MyConvTranspose2d', 'View', 'Normalize',
'Bottleneck']
class DilatedAvgPool2d(Module):
    r"""2D average pooling with a dilation parameter.

    Provided for dilating DenseNet as in :class:`encoding.dilated.DenseNet`.
    Behaves like :class:`torch.nn.AvgPool2d` except that the pooling window
    samples the input with the given ``dilation``:

    .. math::
        out(b, c, h, w) = 1 / (kH * kW) *
            \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
            input(b, c, dH * h + m, dW * w + n)

    Each of :attr:`kernel_size`, :attr:`stride`, :attr:`padding` and
    :attr:`dilation` may be a single ``int`` or a ``tuple`` of two ints
    (height, width).

    Args:
        kernel_size: size of the pooling window
        stride: stride of the window; defaults to :attr:`kernel_size`
        padding: implicit zero padding added on both sides
        dilation: spacing between window elements, as in ``Conv2d``

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where
          :math:`H_{out} = floor((H_{in} + 2*padding[0] - kernel\_size[0]) / stride[0] + 1)` and
          :math:`W_{out} = floor((W_{in} + 2*padding[1] - kernel\_size[1]) / stride[1] + 1)`

    Examples::
        >>> m = nn.DilatedAvgPool2d(3, stride=2, dilation=2)
        >>> input = autograd.Variable(torch.randn(20, 16, 50, 32))
        >>> output = m(input)
    """
    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
        super(DilatedAvgPool2d, self).__init__()
        self.kernel_size = kernel_size
        # default the stride to the kernel size, as nn.AvgPool2d does
        self.stride = stride if stride else kernel_size
        self.padding = padding
        self.dilation = dilation

    def forward(self, input):
        return dilatedavgpool2d(input, self.kernel_size, self.stride,
                                self.padding, self.dilation)

    def __repr__(self):
        return '{} (size={}, stride={}, padding={}, dilation={})'.format(
            self.__class__.__name__, self.kernel_size, self.stride,
            self.padding, self.dilation)
class MyConvTranspose2d(Module):
    """Sub-pixel upsampling convolution.

    Runs a standard ``F.conv2d`` producing ``out_channels * scale_factor**2``
    channels, then rearranges them with ``F.pixel_shuffle`` into an output
    upsampled spatially by ``scale_factor``. Note that despite the name this
    is not a true transposed convolution.

    Args:
        in_channels: input channel count (must be divisible by ``groups``)
        out_channels: output channel count after the pixel shuffle
        kernel_size, stride, padding, dilation, groups: as in ``Conv2d``
        scale_factor: spatial upsampling factor applied by pixel shuffle
        bias: if True, add a learnable bias

    Fix: ``_pair`` was referenced here but never imported in this module,
    so instantiation raised ``NameError``; it is now imported at file level
    from ``torch.nn.modules.utils``.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, scale_factor=1,
                 bias=True):
        super(MyConvTranspose2d, self).__init__()
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        if in_channels % groups != 0:
            raise ValueError('in_channels must be divisible by groups')
        if out_channels % groups != 0:
            raise ValueError('out_channels must be divisible by groups')
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.scale_factor = scale_factor
        # the convolution emits scale_factor**2 sub-pixel copies per channel
        self.weight = Parameter(torch.Tensor(
            out_channels * scale_factor * scale_factor,
            in_channels // groups, *kernel_size))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels *
                                               scale_factor * scale_factor))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        # uniform init with bound 1/sqrt(fan_in), matching torch.nn.Conv2d
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        stdv = 1. / math.sqrt(n)
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        if isinstance(input, Variable):
            out = F.conv2d(input, self.weight, self.bias, self.stride,
                           self.padding, self.dilation, self.groups)
            return F.pixel_shuffle(out, self.scale_factor)
        elif isinstance(input, (tuple, list)):
            # for self-parallel mode, please see encoding.nn
            return my_data_parallel(self, input)
        else:
            raise RuntimeError('unknown input type')
class View(Module):
    """Reshape the input to a fixed size (in place of ``Tensor.view``);
    supports the package's self-parallel mode.

    Accepts either a single ``torch.Size`` or the individual dimensions:
    ``View(torch.Size([2, 3]))`` or ``View(2, 3)``.
    """
    def __init__(self, *args):
        super(View, self).__init__()
        single_size = len(args) == 1 and isinstance(args[0], torch.Size)
        self.size = args[0] if single_size else torch.Size(args)

    def forward(self, input):
        if isinstance(input, (tuple, list)):
            # NOTE(review): view_each is not imported in this module -- confirm
            return view_each(input, self.size)
        if isinstance(input, Variable):
            return input.view(self.size)
        raise RuntimeError('unknown input type')
class Normalize(Module):
    r"""Perform :math:`L_p` normalization over a chosen dimension.

    Computes, for each subtensor :math:`v` along ``dim``:

    .. math::
        v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}

    where the norm is taken over the flattened subtensor (not a matrix
    norm). Defaults to the Euclidean norm over the second dimension.

    Args:
        p (float): exponent of the norm. Default: 2
        dim (int): dimension to reduce. Default: 1
    """
    def __init__(self, p=2, dim=1):
        super(Normalize, self).__init__()
        self.p = p
        self.dim = dim

    def forward(self, x):
        if isinstance(x, (tuple, list)):
            # self-parallel mode; see encoding.nn
            return my_data_parallel(self, x)
        if isinstance(x, Variable):
            return F.normalize(x, self.p, self.dim)
        raise RuntimeError('unknown input type')
class Bottleneck(Module):
    """Pre-activation residual bottleneck block.

    "Identity Mappings in Deep Residual Networks",
    https://arxiv.org/abs/1603.05027

    Args:
        inplanes: number of input channels.
        planes: bottleneck width; the block outputs ``planes * 4`` channels.
        stride: stride of the middle 3x3 convolution.
        norm_layer: normalization layer class. Default: ``BatchNorm2d``.

    Fix: ``Conv2d``, ``ReLU`` and ``Sequential`` were used here but never
    imported in this module (instantiation raised ``NameError``); they are
    now imported at file level from ``torch.nn``.
    """
    def __init__(self, inplanes, planes, stride=1,
                 norm_layer=BatchNorm2d):
        super(Bottleneck, self).__init__()
        self.expansion = 4
        # a 1x1 projection shortcut is needed whenever the shape changes
        if inplanes != planes * self.expansion or stride != 1:
            self.downsample = True
            self.residual_layer = Conv2d(inplanes, planes * self.expansion,
                                         kernel_size=1, stride=stride)
        else:
            self.downsample = False
        conv_block = []
        conv_block += [norm_layer(inplanes),
                       ReLU(inplace=True),
                       Conv2d(inplanes, planes, kernel_size=1, stride=1)]
        conv_block += [norm_layer(planes),
                       ReLU(inplace=True),
                       Conv2d(planes, planes, kernel_size=3, stride=stride,
                              padding=1)]
        conv_block += [norm_layer(planes),
                       ReLU(inplace=True),
                       Conv2d(planes, planes * self.expansion, kernel_size=1,
                              stride=1)]
        self.conv_block = Sequential(*conv_block)

    def forward(self, x):
        if self.downsample:
            residual = self.residual_layer(x)
        else:
            residual = x
        if isinstance(x, Variable):
            return residual + self.conv_block(x)
        elif isinstance(x, (tuple, list)):
            # NOTE(review): sum_each is not imported in this module -- confirm
            # where it should come from before relying on self-parallel mode
            return sum_each(residual, self.conv_block(x))
        else:
            raise RuntimeError('unknown input type')
def _get_a_var(obj):
if isinstance(obj, Variable):
return obj
if isinstance(obj, list) or isinstance(obj, tuple):
results = map(_get_a_var, obj)
for result in results:
if isinstance(result, Variable):
return result
if isinstance(obj, dict):
results = map(_get_a_var, obj.items())
for result in results:
if isinstance(result, Variable):
return result
return None
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
from ..functions import scaledL2, aggregate, aggregateP, residual, assign
from ..parallel import my_data_parallel
__all__ = ['Encoding', 'Inspiration', 'GramMatrix', 'Aggregate','EncodingP']
class Encoding(nn.Module):
    r"""
    Encoding Layer: a learnable residual encoder over 3d or 4d input that
    is seen as a mini-batch.

    .. math::
        e_{ik} = \frac{exp(-s_k\|x_{i}-c_k\|^2)}{\sum_{j=1}^K exp(-s_j\|x_{i}-c_j\|^2)} (x_i - c_k)

    Please see the `example of training Deep TEN <./experiments/texture.html>`_.

    Args:
        D: dimention of the features or feature channels
        K: number of codeswords

    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` or :math:`\mathcal{R}^{B\times D\times H\times W}` (where :math:`B` is batch, :math:`N` is total number of features or :math:`H\times W`.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`

    Attributes:
        codewords (Tensor): the learnable codewords of shape (:math:`K\times D`)
        scale (Tensor): the learnable scale factor of visual centers

    Reference:
        Hang Zhang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network." *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
    """
    def __init__(self, D, K):
        super(Encoding, self).__init__()
        # init codewords and smoothing factor
        self.D, self.K = D, K
        self.codewords = nn.Parameter(torch.Tensor(K, D),
                                      requires_grad=True)
        self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
        self.reset_params()

    def reset_params(self):
        # uniform init with bound 1/sqrt(K*D) resp. 1/sqrt(K).
        # fix: the exponent was written (1/2), which is integer division
        # (i.e. 0) under Python 2 and made both bounds equal to 1.
        std1 = 1. / ((self.K * self.D) ** 0.5)
        std2 = 1. / (self.K ** 0.5)
        self.codewords.data.uniform_(-std1, std1)
        self.scale.data.uniform_(-std2, std2)

    def forward(self, X):
        if isinstance(X, (tuple, list)):
            # for self-parallel mode, please see encoding.nn
            return my_data_parallel(self, X)
        elif not isinstance(X, Variable):
            raise RuntimeError('unknown input type')
        # fix: this was written assert(cond, msg) -- asserting a non-empty
        # tuple, which is always true, so the check never fired
        assert X.size(1) == self.D, "Encoding Layer wrong channels!"
        if X.dim() == 3:
            # BxDxN -> BxNxD
            B, N, K, D = X.size(0), X.size(2), self.K, self.D
            X = X.transpose(1, 2).contiguous()
        elif X.dim() == 4:
            # BxDxHxW -> Bx(HW)xD
            B, N, K, D = X.size(0), X.size(2) * X.size(3), self.K, self.D
            X = X.view(B, D, -1).transpose(1, 2).contiguous()
        else:
            raise RuntimeError('Encoding Layer unknown input dims!')
        # assignment weights
        # NOTE(review): F.softmax is called without an explicit dim --
        # behavior depends on the torch version; confirm the intended axis
        A = F.softmax(scaledL2(X, self.codewords, self.scale))
        # aggregate
        E = aggregate(A, X, self.codewords)
        return E

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
            + str(self.D) + ')'
class Inspiration(nn.Module):
    r"""Inspiration Layer (for MSG-Net): tune a feature map with a target
    Gram matrix.

    .. math::
        Y = \phi^{-1}[\phi(\mathcal{F}^T)W\mathcal{G}]

    Please see the `example of MSG-Net <./experiments/style.html>`_
    training multi-style generative network for real-time transfer.

    Reference:
        Hang Zhang, and Kristin Dana. "Multi-style Generative Network for Real-time Transfer." *arXiv preprint arXiv:1703.06953 (2017)*
    """
    def __init__(self, C, B=1):
        super(Inspiration, self).__init__()
        # B is equal to 1 or input mini_batch
        self.weight = nn.Parameter(torch.Tensor(1, C, C), requires_grad=True)
        # non-parameter buffer holding the target Gram matrix
        self.G = Variable(torch.Tensor(B, C, C), requires_grad=True)
        self.C = C
        self.reset_parameters()

    def reset_parameters(self):
        self.weight.data.uniform_(0.0, 0.02)

    def setTarget(self, target):
        self.G = target

    def forward(self, X):
        # P = W G, broadcasting the single learned weight over the batch
        self.P = torch.bmm(self.weight.expand_as(self.G), self.G)
        batch = X.size(0)
        flat = X.view(batch, X.size(1), -1)
        mixer = self.P.transpose(1, 2).expand(batch, self.C, self.C)
        return torch.bmm(mixer, flat).view_as(X)

    def __repr__(self):
        return '{}(N x {})'.format(self.__class__.__name__, self.C)
class GramMatrix(nn.Module):
    r"""Gram matrix of a 4D convolutional feature map (per mini-batch item),
    normalized by the number of elements per channel map times channels:

    .. math::
        \mathcal{G} = \sum_{h=1}^{H_i}\sum_{w=1}^{W_i} \mathcal{F}_{h,w}\mathcal{F}_{h,w}^T
    """
    def forward(self, y):
        b, ch, h, w = y.size()
        flat = y.view(b, ch, h * w)
        return flat.bmm(flat.transpose(1, 2)) / (ch * h * w)
class Aggregate(nn.Module):
    r"""Aggregate residuals :math:`R` with assignment weights :math:`A`:

    .. math::
        e_{k} = \sum_{i=1}^{N} a_{ik} r_{ik}

    Shape:
        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}`,
          :math:`R\in\mathcal{R}^{B\times N\times K\times D}` (:math:`B`
          batch, :math:`N` features, :math:`K` codewords, :math:`D` dims.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
    """
    def forward(self, A, R):
        if isinstance(A, (tuple, list)):
            # self-parallel mode; see encoding.nn
            return my_data_parallel(self, A, R)
        if not isinstance(A, Variable):
            raise RuntimeError('unknown input type')
        return aggregateP(A, R)
class EncodingP(nn.Module):
    """Deprecated variant of :class:`Encoding` that computes residuals,
    assignments and aggregation in separate explicit steps
    (``residual`` -> ``assign`` -> ``aggregateP``).

    Args:
        D: dimension of the features or feature channels
        K: number of codewords
    """
    def __init__(self, D, K):
        super(EncodingP, self).__init__()
        # init codewords and smoothing factor
        self.D, self.K = D, K
        self.codewords = nn.Parameter(torch.Tensor(K, D),
                                      requires_grad=True)
        self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
        self.reset_params()
        print('EncodingP is deprecated, please use Encoding.')

    def reset_params(self):
        # uniform init with bound 1/sqrt(K*D) resp. 1/sqrt(K).
        # fix: the exponent was written (1/2), which is integer division
        # (i.e. 0) under Python 2 and made both bounds equal to 1.
        std1 = 1. / ((self.K * self.D) ** 0.5)
        std2 = 1. / (self.K ** 0.5)
        self.codewords.data.uniform_(-std1, std1)
        self.scale.data.uniform_(-std2, std2)

    def forward(self, X):
        if isinstance(X, (tuple, list)):
            # for self-parallel mode, please see encoding.nn
            return my_data_parallel(self, X)
        elif not isinstance(X, Variable):
            raise RuntimeError('unknown input type')
        # fix: this was written assert(cond, msg) -- asserting a non-empty
        # tuple, which is always true, so the check never fired
        assert X.size(1) == self.D, "Encoding Layer wrong channels!"
        if X.dim() == 3:
            # BxDxN -> BxNxD
            B, N, K, D = X.size(0), X.size(2), self.K, self.D
            X = X.transpose(1, 2)
        elif X.dim() == 4:
            # BxDxHxW -> Bx(HW)xD
            B, N, K, D = X.size(0), X.size(2) * X.size(3), self.K, self.D
            X = X.view(B, D, -1).transpose(1, 2)
        else:
            raise RuntimeError('Encoding Layer unknown input dims!')
        # calculate residuals
        R = residual(X.contiguous(), self.codewords)
        # assignment weights
        A = assign(R, self.scale)
        # aggregate
        E = aggregateP(A, R)
        return E

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
            + str(self.D) + ')'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment