v0.1.0

d539ddfa · Hang Zhang · 80a12ef6 · d539ddfa · d539ddfa · d539ddfa
Commit d539ddfa authored Nov 16, 2017 by Hang Zhang
20 changed files
--- a/encoding/__init__.py
+++ b/encoding/__init__.py
@@ -8,10 +8,9 @@
 ## LICENSE file in the root directory of this source tree 
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-__version__ = '1.0.1'
+from .version import __version__
 import encoding.nn
 import encoding.functions
 import encoding.dilated
 import encoding.parallel 
--- a/encoding/cmake/FindTorch.cmake
+++ b/encoding/cmake/FindTorch.cmake
@@ -31,9 +31,8 @@ MESSAGE(STATUS "TORCH_BUILD_DIR: " ${TORCH_BUILD_DIR})
 # Find the include files
 SET(TORCH_TH_INCLUDE_DIR "${TORCH_BUILD_DIR}/include/TH")
 SET(TORCH_THC_INCLUDE_DIR "${TORCH_BUILD_DIR}/include/THC")
-SET(TORCH_THC_UTILS_INCLUDE_DIR "$ENV{HOME}/pytorch/torch/lib/THC")
-SET(Torch_INSTALL_INCLUDE "${TORCH_BUILD_DIR}/include" ${TORCH_TH_INCLUDE_DIR} ${TORCH_THC_INCLUDE_DIR} ${TORCH_THC_UTILS_INCLUDE_DIR})
+SET(Torch_INSTALL_INCLUDE "${TORCH_BUILD_DIR}/include" ${TORCH_TH_INCLUDE_DIR} ${TORCH_THC_INCLUDE_DIR})
 # Find the libs. We need to find libraries one by one.
 SET(TH_LIBRARIES "$ENV{TH_LIBRARIES}")

--- a/encoding/dilated/densenet.py
+++ b/encoding/dilated/densenet.py
 import torch
-import torch.nn as nn
+from torch.autograd import Variable
-import torch.nn.functional as F
 import torch.utils.model_zoo as model_zoo
 from collections import OrderedDict
-from ..nn import DilatedAvgPool2d
+from .. import nn
+from .. import functions as F
 __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
@@ -91,7 +91,13 @@ class _DenseLayer(nn.Sequential):
        new_features = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
+        if isinstance(x, Variable):
            return torch.cat([x, new_features], 1)
+        elif isinstance(x, tuple) or isinstance(x, list):
+            return F.cat_each(x, new_features, 1)
+        else:
+            raise RuntimeError('unknown input type')
 class _DenseBlock(nn.Sequential):
@@ -109,7 +115,7 @@ class _Transition(nn.Sequential):
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
                                          kernel_size=1, stride=1, bias=False))
-        self.add_module('pool', DilatedAvgPool2d(kernel_size=2, stride=stride, 
+        self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride, 
                                                 dilation=dilation))

--- a/encoding/dilated/resnet.py
+++ b/encoding/dilated/resnet.py
@@ -4,7 +4,7 @@ from torch.autograd import Variable
 import torch.utils.model_zoo as model_zoo
 __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
-           'resnet152']
+           'resnet152', 'BasicBlock', 'Bottleneck']
 model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
@@ -23,14 +23,14 @@ def conv3x3(in_planes, out_planes, stride=1):
 class BasicBlock(nn.Module):
    expansion = 1
-    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1):
+    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                               padding=dilation, dilation=dilation, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
-                               padding=fist_dilation, dilation=fist_dilation, bias=False)
+                               padding=first_dilation, dilation=first_dilation, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride
@@ -56,14 +56,16 @@ class BasicBlock(nn.Module):
 class Bottleneck(nn.Module):
    expansion = 4
-    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1):
+    def __init__(self, inplanes, planes, stride=1, dilation=1, 
+            downsample=None, first_dilation=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
            padding=dilation, dilation=dilation, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, 
+            bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
@@ -108,7 +110,10 @@ class ResNet(nn.Module):
    """Dilated Pre-trained ResNet Model, which preduces the stride of 8 featuremaps at conv5.
    Reference:
-        Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
+        - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
+        - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """
    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
@@ -145,16 +150,16 @@ class ResNet(nn.Module):
        layers = []
        if dilation == 1 or dilation == 2:
            layers.append(block(self.inplanes, planes, stride, dilation=1, 
-                                downsample=downsample, fist_dilation=dilation))
+                                downsample=downsample, first_dilation=dilation))
        elif dilation ==4:
            layers.append(block(self.inplanes, planes, stride, dilation=2, 
-                                downsample=downsample, fist_dilation=dilation))
+                                downsample=downsample, first_dilation=dilation))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
-            layers.append(block(self.inplanes, planes, dilation=dilation, fist_dilation=dilation))
+            layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation))
        return nn.Sequential(*layers)

--- a/encoding/functions/basic.py
+++ b/encoding/functions/basic.py
@@ -8,21 +8,61 @@
 ## LICENSE file in the root directory of this source tree 
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-__all__ = ['view_each', 'multi_each', 'sum_each', 'upsample']
+import threading
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function, Variable
+__all__ = ['squeeze_each', 'view_each', 'multi_each', 'sum_each', 
+    'cat_each', 'upsample', 'dropout', 'relu']
+def squeeze_each(x, dim=None):
+    """Multi-GPU version torch. squeeze()
+    """
+    y = []
+    for i in range(len(x)):
+        if dim is None:
+            y.append(x[i].squeeze())
+        else:
+            y.append(x[i].squeeze(dim))
+    return y
 def view_each(x, size):
+    """Multi-GPU version torch.view
+    Returns a new tensor with the same data but different size.
+    The returned tensor shares the same data and must have the same number
+    of elements, but may have a different size. A tensor must be
+    :attr:`contiguous` to be viewed.
+    Args:
+        input: list of multi-gpu tensors
+        size (torch.Size or int...): Desired size
+    """
    y = []
    for i in range(len(x)):
        y.append(x[i].view(size))
    return y
 def multi_each(a, b):
+    """Multi-GPU version multiplication
+    .. math::
+        y[i] = a[i] * b[i]
+    """
    y = []
    for i in range(len(a)):
        y.append(a[i] * b[i])
    return y
 def sum_each(x, y):
+    """Multi-GPU version torch.add
+    .. math::
+        y[i] = a[i] + b[i]
+    """
    assert(len(x)==len(y))
    z = []
    for i in range(len(x)):
@@ -30,7 +70,58 @@ def sum_each(x, y):
    return z
+def cat_each(x1, x2, dim):
+    """Multi-GPU version torch.cat
+    .. math::
+        y[i] = torch.cat(a[i], b[i], dim)
+    """
+    assert(len(x1)==len(x2))
+    z = []
+    for i in range(len(x1)):
+        with torch.cuda.device_of(x1[i]):
+            x = torch.cat((x1[i], x2[i]), dim)
+            z.append(x)
+    return z
+def dict_to_list(x):
+    """Converting Dict{} to list[]
+    """
+    y = []
+    for i in range(len(x)):
+        xi = x[i]
+        if isinstance(xi, Exception):
+            raise xi
+        y.append(xi)
+    return y
 def upsample(input, size=None, scale_factor=None, mode='nearest'):
+    """Multi-GPU version torch.nn.functional.upsample
+    Upsamples the input to either the given :attr:`size` or the given
+    :attr:`scale_factor`
+    The algorithm used for upsampling is determined by :attr:`mode`.
+    Currently temporal, spatial and volumetric upsampling are supported, i.e.
+    expected inputs are 3-D, 4-D or 5-D in shape.
+    The input dimensions are interpreted in the form:
+    `mini-batch x channels x [depth] x [height] x width`
+    The modes available for upsampling are: `nearest`, `linear` (3D-only),
+    `bilinear` (4D-only), `trilinear` (5D-only)
+    Args:
+        input (Variable): input
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
+            output spatial size.
+        scale_factor (int): multiplier for spatial size. Has to be an integer.
+        mode (string): algorithm used for upsampling:
+            'nearest' | 'linear' | 'bilinear' | 'trilinear'. Default: 'nearest'
+    """
    if isinstance(input, Variable):
        return F.upsample(input, size=size, scale_factor=scale_factor,
                          mode=mode)
@@ -56,16 +147,105 @@ def upsample(input, size=None, scale_factor=None, mode='nearest'):
            thread.start()
        for thread in threads:
            thread.join() 
-        # gather the results
+        outputs = dict_to_list(results)
-        def _list_gather(x):
+        return outputs
-            y = []
+    else:
-            for i in range(len(x)):
+        raise RuntimeError('unknown input type')
-                xi = x[i]
-                if isinstance(xi, Exception):
-                    raise xi
+def dropout(input, p=0.5, training=False, inplace=True):
-                y.append(xi)
+    """Multi-GPU version torch.nn.functional.droupout
-            return y
-        outputs = _list_gather(results)
+    The channels to zero-out are randomized on every forward call.
+    *Usually the input comes from Conv2d modules.*
+    As described in the paper
+    `Efficient Object Localization Using Convolutional Networks`,
+    if adjacent pixels within feature maps are strongly correlated
+    (as is normally the case in early convolution layers) then iid dropout
+    will not regularize the activations and will otherwise just result
+    in an effective learning rate decrease.
+    In this case, :func:`nn.Dropout2d` will help promote independence between
+    feature maps and should be used instead.
+    Args:
+        p (float, optional): probability of an element to be zeroed.
+        inplace (bool, optional): If set to True, will do this operation
+            in-place
+    Shape:
+        - Input: :math:`(N, C, H, W)`
+        - Output: :math:`(N, C, H, W)` (same shape as input)
+    """
+    if isinstance(input, Variable):
+        return F.dropout(input, p, training, inplace)
+    elif isinstance(input, tuple) or isinstance(input, list):
+        lock = threading.Lock()
+        results = {}
+        def _worker(i, x):
+            try:
+                with torch.cuda.device_of(x):
+                    result =  F.dropout(x, p, training, inplace)
+                with lock:
+                    results[i] = result
+            except Exception as e:
+                with lock:
+                    resutls[i] = e 
+        # multi-threading for different gpu
+        threads = [threading.Thread(target=_worker,
+                                    args=(i, x),
+                                    )
+                   for i, (x) in enumerate(input)]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join() 
+        outputs = dict_to_list(results)
+        return outputs
+    else:
+        raise RuntimeError('unknown input type')
+def relu(input, inplace=False):
+    """Multi-GPU version torch.nn.functional.relu
+    Applies the rectified linear unit function element-wise
+    :math:`{ReLU}(x)= max(0, x)`
+    Args:
+        inplace: can optionally do the operation in-place. Default: False
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+    """
+    if isinstance(input, Variable):
+        return F.relu(input, inplace)
+    elif isinstance(input, tuple) or isinstance(input, list):
+        lock = threading.Lock()
+        results = {}
+        def _worker(i, x):
+            try:
+                with torch.cuda.device_of(x):
+                    result =  F.relu(x, inplace)
+                with lock:
+                    results[i] = result
+            except Exception as e:
+                with lock:
+                    resutls[i] = e 
+        # multi-threading for different gpu
+        threads = [threading.Thread(target=_worker,
+                                    args=(i, x),
+                                    )
+                   for i, (x) in enumerate(input)]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join() 
+        outputs = dict_to_list(results)
        return outputs
    else:

--- a/encoding/functions/customize.py
+++ b/encoding/functions/customize.py
@@ -21,34 +21,57 @@ from .._ext import encoding_lib
 __all__ = ['dilatedavgpool2d']
 class _dilatedavgpool2d(Function):
-    def forward(self, input, kernel_size, stride, padding,
+    @staticmethod
+    def forward(ctx, input, kernel_size, stride, padding,
            dilation=1):
-        self.kH, self.kW = _pair(kernel_size)
+        ctx.kH, ctx.kW = _pair(kernel_size)
-        self.dH, self.dW = _pair(stride if stride is not None else 
+        ctx.dH, ctx.dW = _pair(stride if stride is not None else 
            kernel_size)
-        self.padH, self.padW = _pair(padding)
+        ctx.padH, ctx.padW = _pair(padding)
-        self.dilationH, self.dilationW = _pair(dilation)
+        ctx.dilationH, ctx.dilationW = _pair(dilation)
        b,c,h,w = input.size()
-        if self.dH==1 and self.dW==1:
+        if ctx.dH==1 and ctx.dW==1:
            # keep the size for dilated avgpool
            ow, oh = w, h
        else:
-            ow = math.floor(float(w-self.kW+2*self.padW)/float(self.dW)) +1
+            ow = math.floor(float(w-ctx.kW+2*ctx.padW)/float(ctx.dW)) +1
-            oh = math.floor(float(h-self.kH+2*self.padH)/float(self.dH)) +1
+            oh = math.floor(float(h-ctx.kH+2*ctx.padH)/float(ctx.dH)) +1
+        with torch.cuda.device_of(input):
            output = input.new(b,c,oh,ow)
-        self.save_for_backward(input)
+        ctx.save_for_backward(input)
+        if isinstance(input, torch.cuda.FloatTensor):
+            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(input, output,
-            self.kH, self.kW, self.dH, self.dW, self.padH, self.padW,
+                    ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
-            self.dilationH, self.dilationW)
+                    ctx.dilationH, ctx.dilationW)
+        elif isinstance(input, torch.cuda.DoubleTensor):
+            with torch.cuda.device_of(input):
+                encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(input, output,
+                    ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
+                    ctx.dilationH, ctx.dilationW)
+        else:
+            raise RuntimeError('Unimplemented data type!')
        return output
-    def backward(self, gradOutput):
+    @staticmethod
-        input, = self.saved_variables
+    def backward(ctx, gradOutput):
-        gradInput = input.new().resize_as_(input)
+        input, = ctx.saved_variables
+        with torch.cuda.device_of(input):
+            gradInput = Variable(input.data.new().resize_as_(input.data))
+        if isinstance(input.data, torch.cuda.FloatTensor):
+            with torch.cuda.device_of(input.data):
                encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
-            gradinput, gradoutput,
+                    gradInput.data, gradOutput.data,
-            self.kH, self.kW, self.dH, self.dW, self.padH, self.padW,
+                    ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
-            self.dilationH, self.dilationW)
+                    ctx.dilationH, ctx.dilationW)
+        elif isinstance(input.data, torch.cuda.DoubleTensor):
+            with torch.cuda.device_of(input.data):
+                encoding_lib.Encoding_Double_DilatedAvgPool2d_Backward(
+                    gradInput.data, gradOutput.data,
+                    ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
+                    ctx.dilationH, ctx.dilationW)
+        else:
+            raise RuntimeError('Unimplemented data type!')
        return gradInput, None, None, None, None

--- a/encoding/functions/encoding.py
+++ b/encoding/functions/encoding.py
@@ -15,9 +15,10 @@ import torch.nn.functional as F
 from torch.autograd import Function, Variable
 from .._ext import encoding_lib
-__all__ = ['aggregate', 'scaledL2', 'aggregateP', 'residual', 'assign']
+__all__ = ['aggregate', 'scaledL2']
 class _aggregate(Function):
+    @staticmethod
    def forward(self, A, X, C):
        # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
        self.save_for_backward(A, X, C)
@@ -27,32 +28,33 @@ class _aggregate(Function):
            E = A.new(B,K,D)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
+                encoding_lib.Encoding_Float_aggregate_forward(E, A, X, C)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
+                encoding_lib.Encoding_Double_aggregate_forward(E, A, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        return E
+    @staticmethod
    def backward(self, gradE):
-        A, X, C = self.saved_tensors
+        A, X, C = self.saved_variables
        with torch.cuda.device_of(A):
-            gradA = A.new().resize_as_(A)
+            gradA = Variable(A.data.new().resize_as_(A.data))
-            gradX = A.new().resize_as_(X)
+            gradX = Variable(A.data.new().resize_as_(X.data))
-            gradC = A.new().resize_as_(C)
+            gradC = Variable(A.data.new().resize_as_(C.data))
-        if isinstance(A, torch.cuda.FloatTensor):
+        if isinstance(A.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A):
+            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Float_aggregateE_backward(gradA, 
+                encoding_lib.Encoding_Float_aggregate_backward(gradA.data, 
-                    gradE, A, X, C)
+                    gradE.data, A.data, X.data, C.data)
-        elif isinstance(A, torch.cuda.DoubleTensor):
+        elif isinstance(A.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A):
+            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Double_aggregateE_backward(gradA, 
+                encoding_lib.Encoding_Double_aggregate_backward(gradA.data, 
-                    gradE, A, X, C)
+                    gradE.data, A.data, X.data, C.data)
        else:
            raise RuntimeError('Unimplemented data type!')
-        gradX.copy_(torch.bmm(A, gradE))
+        gradX.data.copy_(torch.bmm(A, gradE).data)
-        gradC.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0))
+        gradC.data.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0).data)
        return gradA, gradX, gradC
 def aggregate(A, X, C):
@@ -76,9 +78,10 @@ def aggregate(A, X, C):
        >>> E = func(A, X, C)
    """
-    return _aggregate()(A, X, C)
+    return _aggregate.apply(A, X, C)
 class _scaledL2(Function):
+    @staticmethod
    def forward(self, X, C, S):
        B,N,D = X.size()
        K = C.size(0)
@@ -94,24 +97,26 @@ class _scaledL2(Function):
            raise RuntimeError('Unimplemented data type!')
        self.save_for_backward(X, C, S, SL)
        return SL
+    @staticmethod
    def backward(self, gradSL):
-        X, C, S, SL = self.saved_tensors
+        X, C, S, SL = self.saved_variables
        K = C.size(0)
-        with torch.cuda.device_of(X):
+        with torch.cuda.device_of(X.data):
-            gradX = X.new().resize_as_(X)
+            gradX = Variable(X.data.new().resize_as_(X.data))
-            gradC = X.new().resize_as_(C)
+            gradC = Variable(X.data.new().resize_as_(C.data))
-            gradS = X.new().resize_as_(S)
+            gradS = Variable(X.data.new().resize_as_(S.data))
-        if isinstance(X, torch.cuda.FloatTensor):
+        if isinstance(X.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X):
+            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Float_scaledl2_backward(gradSL, 
+                encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, 
-                    gradX, gradC, X, C, S)
+                    gradX.data, gradC.data, X.data, C.data, S.data)
-        elif isinstance(X, torch.cuda.DoubleTensor):
+        elif isinstance(X.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X):
+            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Double_scaledl2_backward(gradSL, 
+                encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, 
-                    gradX, gradC, X, C, S)
+                    gradX.data, gradC.data, X.data, C.data, S.data)
        else:
            raise RuntimeError('Unimplemented data type!')
-        gradS.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0))
+        gradS.data.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0).data)
        return gradX, gradC, gradS
@@ -127,145 +132,6 @@ def scaledL2(X, C, S):
        - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
    """
-    return _scaledL2()(X, C, S)
+    return _scaledL2.apply(X, C, S)
-class _aggregateP(Function):
-    def forward(self, A, R):
-        # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
-        self.save_for_backward(A, R)
-        B, N, K, D = R.size()
-        with torch.cuda.device_of(A):
-            E = A.new(B,K,D)
-        if isinstance(A, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
-        elif isinstance(A, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
-        else:
-            raise RuntimeError('Unimplemented data type!')
-        return E
-    def backward(self, gradE):
-        A, R = self.saved_tensors
-        with torch.cuda.device_of(A):
-            gradA = A.new().resize_as_(A)
-            gradR = R.new().resize_as_(R)
-        if isinstance(A, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Float_aggregate_backward(gradA, 
-                    gradR, gradE, A, R)
-        elif isinstance(A, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Double_aggregate_backward(gradA, 
-                    gradR, gradE, A, R)
-        else:
-            raise RuntimeError('Unimplemented data type!')
-        return gradA, gradR
-def aggregateP(A, R):
-    return _aggregateP()(A, R)
-class _residual(Function):
-    def forward(self, X, C):
-        # X \in(BxNxD) D \in(KxD) R \in(BxNxKxD) 
-        B, N, D = X.size()
-        K = C.size(0)
-        with torch.cuda.device_of(X):
-            R = X.new(B,N,K,D)
-        if isinstance(X, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Float_residual_forward(R, X, C)
-        elif isinstance(X, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Double_residual_forward(R, X, C)
-        else:
-            raise RuntimeError('Unimplemented data type!')
-        return R
-    def backward(self, gradR):
-        B, N, K, D = gradR.size()
-        with torch.cuda.device_of(gradR):
-            gradX = gradR.new(B,N,D)
-            gradD = gradR.new(K,D)
-        if isinstance(gradR, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(gradR):
-                encoding_lib.Encoding_Float_residual_backward(gradR, 
-                    gradX, gradD)
-        elif isinstance(gradR, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(gradR):
-                encoding_lib.Encoding_Double_residual_backward(gradR, 
-                    gradX, gradD)
-        else:
-            raise RuntimeError('Unimplemented data type!')
-        return gradX, gradD
-def residual(X, C):
-    r"""
-    Calculate residuals over a mini-batch
-    .. math::
-        r_{ik} = x_i - c_k
-    Shape:
-        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
-        - Output: :math:`R\in\mathcal{R}^{B\times N\times K\times D}`
-    """
-    return _residual()(X, C)
-class _square_squeeze(Function):
-    def forward(self, R):
-        B, N, K, D = R.size()
-        with torch.cuda.device_of(R):
-            L = R.new(B,N,K)
-        if isinstance(R, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(R):
-                encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
-        elif isinstance(R, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(R):
-                encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
-        else:
-            raise RuntimeError('Unimplemented data type!')
-        self.save_for_backward(L, R)
-        return L
-    def backward(self, gradL):
-        L, R = self.saved_tensors
-        B, N, K, D = R.size()
-        with torch.cuda.device_of(R):
-            gradR = R.new(B,N,K,D)
-        if isinstance(R, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(gradL):
-                encoding_lib.Encoding_Float_squaresqueeze_backward(gradL, 
-                    gradR, R)
-        elif isinstance(R, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(gradL):
-                encoding_lib.Encoding_Double_squaresqueeze_backward(gradL, 
-                    gradR, R)
-        else:
-            raise RuntimeError('Unimplemented data type!')
-        return gradR
-def assign(R, S):
-    r"""
-    Calculate assignment weights for given residuals (:math:`R`) and scale (:math:`S`)
-    .. math::
-        a_{ik} = \frac{exp(-s_k\|r_{ik}\|^2)}{\sum_{j=1}^K exp(-s_j\|r_{ik}\|^2)}
-    Shape:
-        - Input: :math:`R\in\mathcal{R}^{B\times N\times K\times D}` :math:`S\in \mathcal{R}^K` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
-        - Output :math:`A\in\mathcal{R}^{B\times N\times K}`
-    """
-    L = _square_squeeze()(R)
-    K = S.size(0)
-    SL = L * S.view(1,1,K)
-    return F.softmax(SL)
--- a/encoding/kernel/generic/device_tensor.h
+++ b/encoding/kernel/generic/device_tensor.h
@@ -34,4 +34,56 @@ THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
    }
    return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
 }
+struct Encoding_(Float2)
+/*
+ * For reduce sum calcualtion of two elements
+ */
+{
+    real v1, v2;
+    __device__ Encoding_(Float2)() {}
+    __device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
+    __device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
+    __device__ Encoding_(Float2)(int v) :  v1(v), v2(v) {}
+    __device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a) 
+    {
+    v1 += a.v1;
+    v2 += a.v2;
+    return *this;
+    }
+};
+static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
+#if CUDA_VERSION >= 9000
+    unsigned int mask = 0xffffffff;
+    for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
+        val += __shfl_xor_sync(mask, val, 1 << i, WARP_SIZE);
+    }
+#else
+#if __CUDA_ARCH__ >= 300
+    for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
+        val += __shfl_xor(val, 1 << i, WARP_SIZE);
+    }
+#else
+    __shared__ real values[MAX_BLOCK_SIZE];
+    values[threadIdx.x] = val;
+    __threadfence_block();
+    const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
+    for (int i = 1; i < WARP_SIZE; i++) {
+        val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
+    }
+#endif
+#endif
+    return val;
+}
+static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(
+    Encoding_(Float2) value) 
+{
+    value.v1 = Encoding_(rwarpSum)(value.v1);
+    value.v2 = Encoding_(rwarpSum)(value.v2);
+    return value;
+}
 #endif
--- a/encoding/kernel/generic/encoding_kernel.c
+++ b/encoding/kernel/generic/encoding_kernel.c
@@ -12,8 +12,7 @@
 #define THC_GENERIC_FILE "generic/encoding_kernel.c"
 #else
+__global__ void Encoding_(Aggregate_Forward_kernel) (
-__global__ void Encoding_(AggregateE_Forward_kernel) (
    THCDeviceTensor<real, 3> E,
    THCDeviceTensor<real, 3> A,
    THCDeviceTensor<real, 3> X,
@@ -23,24 +22,19 @@ __global__ void Encoding_(AggregateE_Forward_kernel) (
 */
 {
    /* declarations of the variables */
-    int b, k, d, i, N;
+    int b, k, d, N;
-    real sum;
    /* Get the index and channels */ 
    b = blockIdx.z;
-    d = blockIdx.x * blockDim.x + threadIdx.x;
+    d = blockIdx.x;
-    k = blockIdx.y * blockDim.y + threadIdx.y;
+    k = blockIdx.y;
-    N = A.getSize(1);
+    N = X.getSize(1);
-    /* boundary check for output */
-    if (d >= E.getSize(2) || k >= E.getSize(1))    return;
-    sum = 0;
    /* main operation */
-    for(i=0; i<N; i++) {
+    Encoding_(AggOp) g(A,X,C);
-        sum += A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
+    E[b][k][d] = Encoding_(reduce_agg)(g,b,k,d,N);
-    }
-    E[b][k][d] = sum;
 }
-void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_, 
+void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_, 
    THCTensor *A_, THCTensor *X_, THCTensor *C_)
 /*
 * aggregating forward the residuals with assignment weights
@@ -60,67 +54,17 @@ void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
    THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
+    // B, K, D
-    dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1, 
+    dim3 blocks(C.getSize(1), C.getSize(0), X.getSize(0));
-                E.getSize(0));
+    // N
-    Encoding_(AggregateE_Forward_kernel)<<<blocks, threads, 0, stream>>>
+    dim3 threads(getNumThreads(X.getSize(1)));
+    Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (E, A, X, C);
    THCudaCheck(cudaGetLastError());
 }
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
-__global__ void Encoding_(Aggregate_Forward_kernel) (
-    THCDeviceTensor<real, 3> E,
-    THCDeviceTensor<real, 3> A,
-    THCDeviceTensor<real, 4> R)
-/*
- * aggregating forward kernel function
- */
-{
-    /* declarations of the variables */
-    int b, k, d, i, N;
-    real sum;
-    /* Get the index and channels */ 
-    b = blockIdx.z;
-    d = blockIdx.x * blockDim.x + threadIdx.x;
-    k = blockIdx.y * blockDim.y + threadIdx.y;
-    N = A.getSize(1);
-    /* boundary check for output */
-    sum = 0;
-    if (d >= E.getSize(2) || k >= E.getSize(1))    return;
-    /* main operation */
-    for(i=0; i<N; i++) {
-        sum += A[b][i][k].ldg() * R[b][i][k][d].ldg();
-    }
-    E[b][k][d] = sum;
-}
-void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_, 
-                            THCTensor *A_, THCTensor *R_)
-/*
- * aggregating forward the residuals with assignment weights
- */
-{
-    /* Check the GPU index and tensor dims*/
-    THCTensor_(checkGPU)(state, 3, E_, A_, R_);
-    if (THCTensor_(nDimension)(state, E_) != 3 ||
-            THCTensor_(nDimension)(state, A_) != 3 ||
-            THCTensor_(nDimension)(state, R_) != 4)
-        THError("Encoding: incorrect input dims. \n");
-    /* Device tensors */
-    THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
-    THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
-    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
-    /* kernel function */
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
-    dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1, 
-                            E.getSize(0));
-    Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>(E, A, R);
-    THCudaCheck(cudaGetLastError());
-}
 /*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
-__global__ void Encoding_(AggregateE_Backward_kernel) (
+__global__ void Encoding_(Aggregate_Backward_kernel) (
    THCDeviceTensor<real, 3> GA,
    THCDeviceTensor<real, 3> GE,
    THCDeviceTensor<real, 3> A,
@@ -132,24 +76,18 @@ __global__ void Encoding_(AggregateE_Backward_kernel) (
 */
 {
    /* declarations of the variables */
-    int b, k, d, i, D;
+    int b, k, i, D;
-    real sum;
    /* Get the index and channels */ 
    b = blockIdx.z;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
+    i = blockIdx.y;
-    k = blockIdx.x * blockDim.x + threadIdx.x;
+    k = blockIdx.x;
    D = GE.getSize(2);
-    /* boundary check for output G \in R^{BxNxKxD} */
-    if (k >= GA.getSize(2) || i >= GA.getSize(1))    return;
    /* main operation */
-    sum = 0;
+    Encoding_(AggBackOp) g(GE,X,C);
-    for(d=0; d<D; d++) {
+    GA[b][i][k] = Encoding_(reduce_aggback)(g,b,i,k,D);
-        sum += GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
-    }
-    GA[b][i][k] = sum;
 }
-void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_, 
+void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_, 
     THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
 /*
 * aggregate backward to assignment weights
@@ -172,209 +110,15 @@ void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_,
    THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
+    // B, K, D
-    dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1, 
+    dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
-                GA.getSize(0));
+    // N
-    Encoding_(AggregateE_Backward_kernel)<<<blocks, threads, 0, stream>>>
+    dim3 threads(getNumThreads(C.getSize(1)));
+    Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GA, GE, A, X, C);
    THCudaCheck(cudaGetLastError());
 }
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
-__global__ void Encoding_(Aggregate_Backward_kernel) (
-    THCDeviceTensor<real, 3> GA,
-    THCDeviceTensor<real, 4> GR,
-    THCDeviceTensor<real, 3> GE,
-    THCDeviceTensor<real, 3> A,
-    THCDeviceTensor<real, 4> R)
-/*
- * aggregating backward kernel function
- * G (dl/dR), L (dl/dE), A
- */
-{
-    /* declarations of the variables */
-    int b, k, d, i, D;
-    real sum;
-    /* Get the index and channels */ 
-    b = blockIdx.z;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
-    k = blockIdx.x * blockDim.x + threadIdx.x;
-    D = GE.getSize(2);
-    /* boundary check for output G \in R^{BxNxKxD} */
-    if (k >= GR.getSize(2) || i >= GR.getSize(1))    return;
-    /* main operation */
-    sum = 0;
-    for(d=0; d<D; d++) {
-        GR[b][i][k][d] = GE[b][k][d].ldg() * A[b][i][k].ldg();
-        sum += GE[b][k][d].ldg() * R[b][i][k][d].ldg();
-    }
-    GA[b][i][k] = sum;
-}
-void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_, 
-     THCTensor *GR_, THCTensor *GE_, THCTensor *A_, THCTensor *R_)
-/*
- * aggregate backward to assignment weights
- * G (dl/dR), L (dl/dE), A
- */
-{
-    /* Check the GPU index and tensor dims*/
-    THCTensor_(checkGPU)(state, 5, GA_, GR_, GE_, A_, R_);
-    if (THCTensor_(nDimension)(state, GA_) != 3 ||
-        THCTensor_(nDimension)(state, GR_) != 4 ||
-        THCTensor_(nDimension)(state, GE_)  != 3 ||
-        THCTensor_(nDimension)(state, A_)  != 3 ||
-        THCTensor_(nDimension)(state, R_)  != 4)
-    THError("Encoding: incorrect input dims. \n");
-    /* Device tensors */
-    THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
-    THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
-    THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
-    THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
-    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
-    /* kernel function */
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
-    dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1, 
-                GA.getSize(0));
-    Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>(GA,
-              GR, GE, A, R);
-    THCudaCheck(cudaGetLastError());
-}
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
-__global__ void Encoding_(Residual_Forward_kernel) (
-    THCDeviceTensor<real, 4> R,
-    THCDeviceTensor<real, 3> X,
-    THCDeviceTensor<real, 2> D)
-/*
- * aggregating forward kernel function
- */
-{
-    /* declarations of the variables */
-    int b, k, d, i, K;
-    /* Get the index and channels */ 
-    b = blockIdx.z;
-    d = blockIdx.x * blockDim.x + threadIdx.x;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
-    K = R.getSize(2);
-    /* boundary check for output */
-    if (d >= X.getSize(2) || i >= X.getSize(1))    return;
-    /* main operation */
-    for(k=0; k<K; k++) {
-        R[b][i][k][d] = X[b][i][d].ldg() - D[k][d].ldg();
-    }
-}
-void Encoding_(Residual_Forward)(
-    THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_)
-/*
- * aggregating forward the residuals with assignment weights
- */
-{
-    /* Check the GPU index and tensor dims*/
-    THCTensor_(checkGPU)(state, 3, R_, X_, D_); 
-    if (THCTensor_(nDimension)(state, R_) != 4 ||
-        THCTensor_(nDimension)(state, X_) != 3 ||
-        THCTensor_(nDimension)(state, D_) != 2)
-    THError("Encoding: incorrect input dims. \n");
-    /* Device tensors */
-    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
-    THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
-    THCDeviceTensor<real, 2> D = devicetensor<2>(state, D_);
-    /* kernel function */
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
-    dim3 blocks(X.getSize(2)/16+1, X.getSize(1)/16+1, 
-                X.getSize(0));
-    Encoding_(Residual_Forward_kernel)<<<blocks, threads, 0, stream>>>(R, X, D);
-    THCudaCheck(cudaGetLastError());
-}
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
-__global__ void Encoding_(ResidualX_Backward_kernel) (
-    THCDeviceTensor<real, 4> GR,
-    THCDeviceTensor<real, 3> GX)
-/*
- * aggregating forward kernel function
- */
-{
-    /* declarations of the variables */
-    int b, k, d, i, K;
-    real sum;
-    /* Get the index and channels */ 
-    b = blockIdx.z;
-    d = blockIdx.x * blockDim.x + threadIdx.x;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
-    K = GR.getSize(2);
-    /* boundary check for output */
-    if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
-    /* main operation */
-    sum = 0;
-    for(k=0; k<K; k++) {
-        sum += GR[b][i][k][d].ldg();
-    }
-    GX[b][i][d] = sum;
-}
-__global__ void Encoding_(ResidualD_Backward_kernel) (
-    THCDeviceTensor<real, 4> GR,
-    THCDeviceTensor<real, 2> GD)
-/*
- * aggregating forward kernel function
- */
-{
-    /* declarations of the variables */
-    int b, k, d, i, B, N;
-    real sum;
-    /* Get the index and channels */ 
-    d = blockIdx.x * blockDim.x + threadIdx.x;
-    k = blockIdx.y * blockDim.y + threadIdx.y;
-    B = GR.getSize(0);
-    N = GR.getSize(1);
-    /* boundary check for output */
-    if (d >= GD.getSize(1) || k >= GD.getSize(0)) return;
-    /* main operation */
-    sum = 0;
-    for(b=0; b<B; b++) {
-        for(i=0; i<N; i++) {
-            sum -= GR[b][i][k][d].ldg();
-        }
-    }
-    GD[k][d] = sum;
-}
-void Encoding_(Residual_Backward)(
-    THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_)
-/*
- * aggregating forward the residuals with assignment weights
- */
-{
-    /* Check the GPU index and tensor dims*/
-    THCTensor_(checkGPU)(state, 3, GR_, GX_, GD_); 
-    if (THCTensor_(nDimension)(state, GR_) != 4 ||
-        THCTensor_(nDimension)(state, GX_) != 3 ||
-        THCTensor_(nDimension)(state, GD_) != 2)
-    THError("Encoding: incorrect input dims. \n");
-    /* Device tensors */
-    THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
-    THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
-    THCDeviceTensor<real, 2> GD = devicetensor<2>(state, GD_);
-    /* kernel function */
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
-    dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1, 
-                GX.getSize(0));
-    Encoding_(ResidualX_Backward_kernel)<<<blocks, threads, 0, stream>>>
-        (GR, GX);
-    THCudaCheck(cudaGetLastError());
-    dim3 blocks2(GD.getSize(1)/16+1, GD.getSize(0)/16+1); 
-    Encoding_(ResidualD_Backward_kernel)<<<blocks2, threads, 0, stream>>>
-        (GR, GD);
-    THCudaCheck(cudaGetLastError());
-}
 /*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 __global__ void Encoding_(ScaledL2_Forward_kernel) (
    THCDeviceTensor<real, 3> SL,
@@ -386,22 +130,15 @@ __global__ void Encoding_(ScaledL2_Forward_kernel) (
 */
 {
    /* declarations of the variables */
-    int b, k, d, i, D;
+    int b, k, i, D;
-    real r, sum;
    /* Get the index and channels */ 
    b = blockIdx.z;
-    k = blockIdx.x * blockDim.x + threadIdx.x;
+    k = blockIdx.x;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
+    i = blockIdx.y;
    D = X.getSize(2);
-    /* boundary check for output */
-    if (k >= SL.getSize(2) || i >= SL.getSize(1)) return;
    /* main operation */
-    sum = 0;
+    Encoding_(L2Op) g(X,C);
-    for(d=0; d<D; d++) {
+    SL[b][i][k] = S[k] * Encoding_(reduce_sl2)(g,b,i,k,D);;
-        r = X[b][i][d].ldg() - C[k][d].ldg();
-        sum += r * r;
-    }
-    SL[b][i][k] = S[k] * sum;
 }
 void Encoding_(ScaledL2_Forward)(
@@ -425,9 +162,8 @@ void Encoding_(ScaledL2_Forward)(
    THCDeviceTensor<real, 1> S  = devicetensor<1>(state, S_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
+    dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
-    dim3 blocks(SL.getSize(2)/16+1, SL.getSize(1)/16+1, 
+    dim3 threads(getNumThreads(C.getSize(1)));
-                SL.getSize(0));
    Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (SL, X, C, S);
    THCudaCheck(cudaGetLastError());
@@ -444,22 +180,15 @@ __global__ void Encoding_(ScaledL2X_Backward_kernel) (
 */
 {
    /* declarations of the variables */
-    int b, k, d, i, K;
+    int b, d, i, K;
-    real sum;
    /* Get the index and channels */ 
    b = blockIdx.z;
-    d = blockIdx.x * blockDim.x + threadIdx.x;
+    d = blockIdx.x;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
+    i = blockIdx.y;
    K = C.getSize(0);
-    /* boundary check for output */
-    if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
    /* main operation */
-    sum = 0;
+    Encoding_(L2XBackOp) g(GSL,X,C,S);
-    for(k=0; k<K; k++) {
+    GX[b][i][d] = Encoding_(reduce_sl2xback)(g,b,i,d,K);
-        sum += 2*S[k].ldg() * GSL[b][i][k].ldg() *
-            (X[b][i][d].ldg()-C[k][d].ldg());
-    }
-    GX[b][i][d] = sum;
 }
 __global__ void Encoding_(ScaledL2C_Backward_kernel) (
@@ -472,24 +201,15 @@ __global__ void Encoding_(ScaledL2C_Backward_kernel) (
 */
 {
    /* declarations of the variables */
-    int b, k, d, i, B, N;
+    int k, d, B, N;
-    real sum;
    /* Get the index and channels */ 
-    d = blockIdx.x * blockDim.x + threadIdx.x;
+    d = blockIdx.x;
-    k = blockIdx.y * blockDim.y + threadIdx.y;
+    k = blockIdx.y;
    B = X.getSize(0);
    N = X.getSize(1);
-    /* boundary check for output */
-    if (d >= GC.getSize(1) || k >= GC.getSize(0)) return;
    /* main operation */
-    sum = 0;
+    Encoding_(L2CBackOp) g(GSL,X,C,S);
-    for(b=0; b<B; b++) {
+    GC[k][d] = Encoding_(reduce_sl2cback)(g,k,d,B,N);
-        for(i=0; i<N; i++) {
-            sum += -2*S[k].ldg() * GSL[b][i][k].ldg() *
-                (X[b][i][d].ldg()-C[k][d].ldg());
-        }
-    }
-    GC[k][d] = sum;
 }
 void Encoding_(ScaledL2_Backward)(
@@ -516,14 +236,14 @@ void Encoding_(ScaledL2_Backward)(
    THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
    /* kernel function */
    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
+    dim3 blocks(X.getSize(2), X.getSize(1), X.getSize(0));
-    dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1, 
+    dim3 threads(getNumThreads(C.getSize(0)));
-                GX.getSize(0));
    Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GSL, GX, X, C, S);
    THCudaCheck(cudaGetLastError());
-    dim3 blocks2(GC.getSize(1)/16+1, GX.getSize(0)/16+1);
+    dim3 blocks2(C.getSize(1), C.getSize(0));
-    Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads, 0, stream>>>
+    dim3 threads2(getNumThreads(X.getSize(1)));
+    Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads2, 0, stream>>>
        (GSL, GC, X, C, S);
    THCudaCheck(cudaGetLastError());
 }

--- a/encoding/kernel/generic/encoding_kernel.h
+++ b/encoding/kernel/generic/encoding_kernel.h
@@ -12,10 +12,10 @@
 #define THC_GENERIC_FILE "generic/encoding_kernel.h"
 #else
-void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_, 
+void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_, 
    THCTensor *A_, THCTensor *X_, THCTensor *C_);
-void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_, 
+void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_, 
     THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
 void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,  
@@ -25,17 +25,4 @@ void Encoding_(ScaledL2_Backward)(
    THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
    THCTensor *X_, THCTensor *C_, THCTensor *S_);
-void Encoding_(Aggregate_Forward)(
-    THCState *state, THCTensor *E_, THCTensor *A_, THCTensor *R_);
-void Encoding_(Aggregate_Backward)(
-    THCState *state, THCTensor *GA_, THCTensor *GR_, THCTensor *L_, 
-    THCTensor *A_, THCTensor *R_);
-void Encoding_(Residual_Forward)(
-    THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_);
-void Encoding_(Residual_Backward)(
-    THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_);
 #endif
--- a/encoding/kernel/generic/encoding_utils.c
+++ b/encoding/kernel/generic/encoding_utils.c
+/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ * Created by: Hang Zhang
+ * ECE Department, Rutgers University
+ * Email: zhang.hang@rutgers.edu
+ * Copyright (c) 2017
+ *
+ * This source code is licensed under the MIT-style license found in the
+ * LICENSE file in the root directory of this source tree 
+ *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ */
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/encoding_utils.c"
+#else
+struct Encoding_(AggOp) {
+    __device__ Encoding_(AggOp)(THCDeviceTensor<real, 3> a,
+                                THCDeviceTensor<real, 3> x,
+                                THCDeviceTensor<real, 2> c)
+        : A(a), X(x), C(c) {}
+    __device__ __forceinline__ real operator()(int b, int i, int k, int d) 
+    {
+        return A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
+    }
+    THCDeviceTensor<real, 3> A;
+    THCDeviceTensor<real, 3> X;
+    THCDeviceTensor<real, 2> C;
+};
+__device__ real Encoding_(reduce_agg)(
+        Encoding_(AggOp) op, 
+        int b, int k, int d, int N)
+{
+    real sum = 0;
+    for (int x = threadIdx.x; x < N; x += blockDim.x) {
+        sum += op(b,x,k,d);
+    }
+    // sum over NumThreads within a warp
+    sum = Encoding_(rwarpSum)(sum);
+    // 'transpose', and reduce within warp again
+    __shared__ real shared[32];
+    __syncthreads();
+    if (threadIdx.x % WARP_SIZE == 0) {
+        if (threadIdx.x / WARP_SIZE < 32) {
+                shared[threadIdx.x / WARP_SIZE] = sum;
+        }
+    }
+    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
+        // zero out the other entries in shared
+        shared[threadIdx.x] = (real) 0;
+    }
+    __syncthreads();
+    if (threadIdx.x / WARP_SIZE == 0) {
+        sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
+        if (threadIdx.x == 0) {
+            shared[0] = sum;
+        }
+    }
+    __syncthreads();
+  // Everyone picks it up, should be broadcast into the whole gradInput
+  return shared[0];
+}
+struct Encoding_(AggBackOp) {
+    __device__ Encoding_(AggBackOp)(THCDeviceTensor<real, 3> ge,
+                                    THCDeviceTensor<real, 3> x,
+                                    THCDeviceTensor<real, 2> c)
+        : GE(ge), X(x), C(c) {}
+    __device__ __forceinline__ real operator()(int b, int i, int k, int d) 
+    {
+        return GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
+    }
+    THCDeviceTensor<real, 3> GE;
+    THCDeviceTensor<real, 3> X;
+    THCDeviceTensor<real, 2> C;
+};
+__device__ real Encoding_(reduce_aggback)(
+        Encoding_(AggBackOp) op, 
+        int b, int i, int k, int D)
+{
+    real sum = 0;
+    for (int x = threadIdx.x; x < D; x += blockDim.x) {
+        sum += op(b,i,k,x);
+    }
+    // sum over NumThreads within a warp
+    sum = Encoding_(rwarpSum)(sum);
+    // 'transpose', and reduce within warp again
+    __shared__ real shared[32];
+    __syncthreads();
+    if (threadIdx.x % WARP_SIZE == 0) {
+        if (threadIdx.x / WARP_SIZE < 32) {
+                shared[threadIdx.x / WARP_SIZE] = sum;
+        }
+    }
+    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
+        // zero out the other entries in shared
+        shared[threadIdx.x] = (real) 0;
+    }
+    __syncthreads();
+    if (threadIdx.x / WARP_SIZE == 0) {
+        sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
+        if (threadIdx.x == 0) {
+            shared[0] = sum;
+        }
+    }
+    __syncthreads();
+  // Everyone picks it up, should be broadcast into the whole gradInput
+  return shared[0];
+}
+struct Encoding_(L2Op) {
+    __device__ Encoding_(L2Op)(THCDeviceTensor<real, 3> x,
+                               THCDeviceTensor<real, 2> c)
+        : X(x), C(c) {}
+    __device__ __forceinline__ real operator()(int b, int i, int k, int d) 
+    {
+        real r = X[b][i][d].ldg() - C[k][d].ldg();
+        return r * r;
+    }
+    THCDeviceTensor<real, 3> X;
+    THCDeviceTensor<real, 2> C;
+};
+__device__ real Encoding_(reduce_sl2)(
+        Encoding_(L2Op) op, 
+        int b, int i, int k, int D)
+{
+    real sum = 0;
+    for (int x = threadIdx.x; x < D; x += blockDim.x) {
+        sum += op(b,i,k,x);
+    }
+    // sum over NumThreads within a warp
+    sum = Encoding_(rwarpSum)(sum);
+    // 'transpose', and reduce within warp again
+    __shared__ real shared[32];
+    __syncthreads();
+    if (threadIdx.x % WARP_SIZE == 0) {
+        if (threadIdx.x / WARP_SIZE < 32) {
+                shared[threadIdx.x / WARP_SIZE] = sum;
+        }
+    }
+    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
+        // zero out the other entries in shared
+        shared[threadIdx.x] = (real) 0;
+    }
+    __syncthreads();
+    if (threadIdx.x / WARP_SIZE == 0) {
+        sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
+        if (threadIdx.x == 0) {
+            shared[0] = sum;
+        }
+    }
+    __syncthreads();
+  // Everyone picks it up, should be broadcast into the whole gradInput
+  return shared[0];
+}
+struct Encoding_(L2XBackOp) {
+    __device__ Encoding_(L2XBackOp)(
+        THCDeviceTensor<real, 3> gsl,
+        THCDeviceTensor<real, 3> x,
+        THCDeviceTensor<real, 2> c,
+        THCDeviceTensor<real, 1> s
+    ) : GSL(gsl), X(x), C(c), S(s) {}
+    __device__ __forceinline__ real operator()(int b, int i, int k, int d) 
+    {
+        return 2*S[k].ldg() * GSL[b][i][k].ldg() *
+            (X[b][i][d].ldg()-C[k][d].ldg());
+    }
+    THCDeviceTensor<real, 3> GSL;
+    THCDeviceTensor<real, 3> X;
+    THCDeviceTensor<real, 2> C;
+    THCDeviceTensor<real, 1> S;
+};
+__device__ real Encoding_(reduce_sl2xback)(
+        Encoding_(L2XBackOp) op, 
+        int b, int i, int d, int K)
+{
+    real sum = 0;
+    for (int x = threadIdx.x; x < K; x += blockDim.x) {
+        sum += op(b,i,x,d);
+    }
+    // sum over NumThreads within a warp
+    sum = Encoding_(rwarpSum)(sum);
+    // 'transpose', and reduce within warp again
+    __shared__ real shared[32];
+    __syncthreads();
+    if (threadIdx.x % WARP_SIZE == 0) {
+        if (threadIdx.x / WARP_SIZE < 32) {
+                shared[threadIdx.x / WARP_SIZE] = sum;
+        }
+    }
+    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
+        // zero out the other entries in shared
+        shared[threadIdx.x] = (real) 0;
+    }
+    __syncthreads();
+    if (threadIdx.x / WARP_SIZE == 0) {
+        sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
+        if (threadIdx.x == 0) {
+            shared[0] = sum;
+        }
+    }
+    __syncthreads();
+  // Everyone picks it up, should be broadcast into the whole gradInput
+  return shared[0];
+}
+struct Encoding_(L2CBackOp) {
+    __device__ Encoding_(L2CBackOp)(
+        THCDeviceTensor<real, 3> gsl,
+        THCDeviceTensor<real, 3> x,
+        THCDeviceTensor<real, 2> c,
+        THCDeviceTensor<real, 1> s
+    ) : GSL(gsl), X(x), C(c), S(s) {}
+    __device__ __forceinline__ real operator()(int b, int i, int k, int d) 
+    {
+        return -2*S[k].ldg() * GSL[b][i][k].ldg() *
+                (X[b][i][d].ldg()-C[k][d].ldg());
+    }
+    THCDeviceTensor<real, 3> GSL;
+    THCDeviceTensor<real, 3> X;
+    THCDeviceTensor<real, 2> C;
+    THCDeviceTensor<real, 1> S;
+};
+__device__ real Encoding_(reduce_sl2cback)(
+        Encoding_(L2CBackOp) op, 
+        int k, int d, int B, int N)
+{
+    real sum = 0;
+    for (int batch = 0; batch < B; ++batch) {
+        for (int x = threadIdx.x; x < N; x += blockDim.x) {
+            sum += op(batch,x,k,d);
+        }
+    }
+    // sum over NumThreads within a warp
+    sum = Encoding_(rwarpSum)(sum);
+    // 'transpose', and reduce within warp again
+    __shared__ real shared[32];
+    __syncthreads();
+    if (threadIdx.x % WARP_SIZE == 0) {
+        if (threadIdx.x / WARP_SIZE < 32) {
+                shared[threadIdx.x / WARP_SIZE] = sum;
+        }
+    }
+    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
+        // zero out the other entries in shared
+        shared[threadIdx.x] = (real) 0;
+    }
+    __syncthreads();
+    if (threadIdx.x / WARP_SIZE == 0) {
+        sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
+        if (threadIdx.x == 0) {
+            shared[0] = sum;
+        }
+    }
+    __syncthreads();
+  // Everyone picks it up, should be broadcast into the whole gradInput
+  return shared[0];
+}
+#endif
--- a/encoding/kernel/generic/syncbn_kernel.c
+++ b/encoding/kernel/generic/syncbn_kernel.c
@@ -13,107 +13,6 @@
 #else
-__global__ void Encoding_(SquareSqueeze_Forward_kernel) (
-    THCDeviceTensor<real, 3> L,
-    THCDeviceTensor<real, 4> R)
-/*
- * aggregating forward kernel function
- */
-{
-    /* declarations of the variables */
-    int b, k, d, i, D;
-    real sum;
-    /* Get the index and channels */ 
-    b = blockIdx.z;
-    k = blockIdx.x * blockDim.x + threadIdx.x;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
-    D = R.getSize(3);
-    /* boundary check for output */
-    if (k >= L.getSize(2) || i >= L.getSize(1)) return;
-    /* main operation */
-    sum = 0;
-    for(d=0; d<D; d++) {
-        sum += R[b][i][k][d].ldg()*R[b][i][k][d].ldg();
-    }
-    L[b][i][k] = sum;
-}
-void Encoding_(SquareSqueeze_Forward)(
-    THCState *state, THCTensor *L_,  THCTensor *R_)
-/*
- * aggregating forward the residuals with assignment weights
- */
-{
-    /* Check the GPU index and tensor dims*/
-    THCTensor_(checkGPU)(state, 2, L_, R_); 
-    if (THCTensor_(nDimension)(state, L_) != 3 ||
-        THCTensor_(nDimension)(state, R_) != 4)
-    THError("Encoding: incorrect input dims. \n");
-    /* Device tensors */
-    THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
-    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
-    /* kernel function */
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
-    dim3 blocks(L.getSize(2)/16+1, L.getSize(1)/16+1, 
-                L.getSize(0));
-    Encoding_(SquareSqueeze_Forward_kernel)<<<blocks, threads, 0, stream>>>
-        (L, R);
-    THCudaCheck(cudaGetLastError());
-}
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
-__global__ void Encoding_(SquareSqueeze_Backward_kernel) (
-    THCDeviceTensor<real, 3> GL,
-    THCDeviceTensor<real, 4> GR,
-    THCDeviceTensor<real, 4> R)
-/*
- */
-{
-    /* declarations of the variables */
-    int b, k, d, i, D;
-    real scale;
-    /* Get the index and channels */ 
-    b = blockIdx.z;
-    k = blockIdx.x * blockDim.x + threadIdx.x;
-    i = blockIdx.y * blockDim.y + threadIdx.y;
-    D = R.getSize(3);
-    /* boundary check for output */
-    if (k >= R.getSize(2) || i >= R.getSize(1)) return;
-    /* main operation */
-    scale = GL[b][i][k] * 2;
-    for(d=0; d<D; d++) {
-        GR[b][i][k][d] = scale * R[b][i][k][d];
-    }
-}
-void Encoding_(SquareSqueeze_Backward)(
-    THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_)
-/*
- */
-{
-    /* Check the GPU index and tensor dims*/
-    THCTensor_(checkGPU)(state, 3, GL_, GR_, R_); 
-    if (THCTensor_(nDimension)(state, GL_) != 3 ||
-        THCTensor_(nDimension)(state, GR_) != 4 ||
-        THCTensor_(nDimension)(state, R_) != 4)
-    THError("Encoding: incorrect input dims. \n");
-    /* Device tensors */
-    THCDeviceTensor<real, 3> GL = devicetensor<3>(state, GL_);
-    THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
-    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
-    /* kernel function */
-    cudaStream_t stream = THCState_getCurrentStream(state);
-    dim3 threads(16, 16);
-    dim3 blocks(R.getSize(2)/16+1, R.getSize(1)/16+1, 
-                R.getSize(0));
-    Encoding_(SquareSqueeze_Backward_kernel)<<<blocks, threads, 0, stream>>>
-        (GL, GR, R);
-    THCudaCheck(cudaGetLastError());
-}
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 __global__ void Encoding_(BatchNorm_Forward_kernel) (
    THCDeviceTensor<real, 3> output,
    THCDeviceTensor<real, 3> input,
@@ -168,42 +67,6 @@ void Encoding_(BatchNorm_Forward)(THCState *state,
    THCudaCheck(cudaGetLastError());
 }
-struct Encoding_(Float2){
-    real v1, v2;
-    __device__ Encoding_(Float2)() {}
-    __device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
-    __device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
-    __device__ Encoding_(Float2)(int v) :  v1(v), v2(v) {}
-    __device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a) {
-    v1 += a.v1;
-    v2 += a.v2;
-    return *this;
-  }
-};
-static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
-#if __CUDA_ARCH__ >= 300
-  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
-    val += __shfl_xor(val, 1 << i, WARP_SIZE);
-  }
-#else
-  __shared__ real values[MAX_BLOCK_SIZE];
-  values[threadIdx.x] = val;
-  __threadfence_block();
-  const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
-  for (int i = 1; i < WARP_SIZE; i++) {
-    val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
-  }
-#endif
-  return val;
-}
-static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(Encoding_(Float2) value) {
-  value.v1 = Encoding_(rwarpSum)(value.v1);
-  value.v2 = Encoding_(rwarpSum)(value.v2);
-  return value;
-}
 struct Encoding_(GradOp) {
    __device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
        : mean(m), input(i), gradOutput(g) {}
@@ -217,8 +80,12 @@ struct Encoding_(GradOp) {
    THCDeviceTensor<real, 3> gradOutput;
 };
-// Sum across (batch, x/y/z) applying Op() pointwise
+// Sum across (batch, b/c/n) applying Op() pointwise
-__device__ Encoding_(Float2) Encoding_(reduce)(Encoding_(GradOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
+__device__ Encoding_(Float2) Encoding_(reduce)(
+        Encoding_(GradOp) op,
+        THCDeviceTensor<real, 3> tensor, 
+        int plane) 
+{
    Encoding_(Float2) sum = (Encoding_(Float2))0;
    for (int batch = 0; batch < tensor.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {

--- a/encoding/kernel/generic/syncbn_kernel.h
+++ b/encoding/kernel/generic/syncbn_kernel.h
@@ -12,12 +12,6 @@
 #define THC_GENERIC_FILE "generic/syncbn_kernel.h"
 #else
-void Encoding_(SquareSqueeze_Forward)(
-    THCState *state, THCTensor *L_,  THCTensor *R_);
-void Encoding_(SquareSqueeze_Backward)(
-    THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
 void Encoding_(BatchNorm_Forward)(THCState *state, 
    THCTensor *output_, THCTensor *input_, 
    THCTensor *mean_, THCTensor *invstd_,

--- a/encoding/kernel/include/README.rst
+++ b/encoding/kernel/include/README.rst
+Make a copy from PyTorch lib to make the compilation easier for users, due to so many questions and requests.
--- a/encoding/kernel/include/THCDeviceTensor-inl.cuh
+++ b/encoding/kernel/include/THCDeviceTensor-inl.cuh
+#include <assert.h>
+namespace detail {
+template <typename T, int N>
+__host__ __device__ void copy(T to[N], T from[N]) {
+  for (int i = 0; i < N; ++i) {
+    to[i] = from[i];
+  }
+}
+} // namespace detail
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
+    : data_(NULL) {
+  thc_static_assert(Dim > 0);
+  for (int i = 0; i < Dim; ++i) {
+    size_[i] = 0;
+    stride_[i] = (IndexT) 1;
+  }
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
+#ifdef _MSC_VER
+THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
+#else
+THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
+#endif
+    : data_(data) {
+  thc_static_assert(Dim > 0);
+  for (int i = 0; i < Dim; ++i) {
+    size_[i] = sizes[i];
+  }
+  stride_[Dim - 1] = (IndexT) 1;
+  for (int i = Dim - 2; i >= 0; --i) {
+    stride_[i] = stride_[i + 1] * sizes[i + 1];
+  }
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
+#ifdef _MSC_VER
+  DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
+#else
+  DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
+#endif
+    : data_(data) {
+  thc_static_assert(Dim > 0);
+  for (int i = 0; i < Dim; ++i) {
+    size_[i] = sizes[i];
+    stride_[i] = strides[i];
+  }
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int OtherDim>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
+  const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
+  if (Dim != OtherDim) {
+    return false;
+  }
+  for (int i = 0; i < Dim; ++i) {
+    if (size_[i] != rhs.size_[i]) {
+      return false;
+    }
+    if (stride_[i] != rhs.stride_[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <typename U>
+__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
+  thc_static_assert(sizeof(U) == sizeof(T));
+  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
+    reinterpret_cast<U*>(data_), size_, stride_);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <typename U>
+__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
+  thc_static_assert(sizeof(U) == sizeof(T));
+  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
+    reinterpret_cast<U*>(data_), size_, stride_);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ ptrdiff_t
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
+  ptrdiff_t size = getSize(0);
+  for (int i = 1; i < Dim; ++i) {
+    size *= getSize(i);
+  }
+  return size;
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
+  return isContiguousRange(0, Dim);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
+  if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
+    return true;
+  } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
+             ((getStride(i - 1) / getStride(i)) >= getSize(i))) {
+    return true;
+  }
+  return false;
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
+  for (int i = 0; i < Dim; ++i) {
+    if (!isConsistentlySized(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ bool
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousRange(
+  int first, int last) const {
+  int64_t prevSize = last < Dim ? getStride(last) * getSize(last) : 1;
+  for (int i = last - 1; i >= first; --i) {
+    if (getSize(i) != (IndexT) 1) {
+      if (getStride(i) == prevSize) {
+        prevSize *= getSize(i);
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
+                                                      int dim2) const {
+#ifdef __CUDA_ARCH__
+  // Device code
+  assert(dim1 >= 0 && dim1 < Dim);
+  assert(dim1 >= 0 && dim2 < Dim);
+#else
+  // Host code
+  if (dim1 < 0 || dim1 >= Dim) {
+    THError("dim1 out of bounds");
+  }
+  if (dim2 < 0 || dim2 >= Dim) {
+    THError("dim2 out of bounds");
+  }
+#endif
+  IndexT newSize[Dim];
+  IndexT newStride[Dim];
+  for (int i = 0; i < Dim; ++i) {
+    newSize[i] = size_[i];
+    newStride[i] = stride_[i];
+  }
+  IndexT tmp = newSize[dim1];
+  newSize[dim1] = newSize[dim2];
+  newSize[dim2] = tmp;
+  tmp = newStride[dim1];
+  newStride[dim1] = newStride[dim2];
+  newStride[dim2] = tmp;
+  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
+  // Can only create tensors of greater dimension
+  thc_static_assert(NewDim > Dim);
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+  int shift = NewDim - Dim;
+  for (int i = 0; i < NewDim; ++i) {
+    if (i < shift) {
+      // These are the extended dimensions
+      newSize[i] = (IndexT) 1;
+      newStride[i] = size_[0] * stride_[0];
+    } else {
+      // Shift the remaining dimensions
+      newSize[i] = size_[i - shift];
+      newStride[i] = stride_[i - shift];
+    }
+  }
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
+  // Can only create tensors of greater dimension
+  thc_static_assert(NewDim > Dim);
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+  for (int i = 0; i < NewDim; ++i) {
+    if (i < Dim) {
+      // Existing dimensions get copied over
+      newSize[i] = size_[i];
+      newStride[i] = stride_[i];
+    } else {
+      // Extended dimensions
+      newSize[i] = (IndexT) 1;
+      newStride[i] = (IndexT) 1;
+    }
+  }
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
+  // Can only create tensors of lesser dimension
+  thc_static_assert(NewDim < Dim);
+  // We can't downcast non-contiguous tensors, since it leaves
+  // garbage data in the tensor. The tensor needs to be contiguous
+  // in all of the dimensions we are collapsing (no padding in
+  // them).
+  bool cont = isContiguousRange(0, Dim - NewDim);
+#ifdef __CUDA_ARCH__
+  // Device code
+  assert(cont);
+#else
+  // Host code
+  if (!cont) {
+    THError("Can only downcast contiguous tensors");
+  }
+#endif
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+  int ignoredDims = Dim - NewDim;
+  IndexT collapsedSize = 1;
+  for (int i = 0; i < Dim; ++i) {
+    if (i < ignoredDims) {
+      // Collapse these dimensions
+      collapsedSize *= getSize(i);
+    } else {
+      // Non-collapsed dimensions
+      if (i == ignoredDims) {
+        // This is the first non-collapsed dimension
+        newSize[i - ignoredDims] = collapsedSize * getSize(i);
+      } else {
+        // Subsequent non-collapsed dimensions
+        newSize[i - ignoredDims] = getSize(i);
+      }
+      newStride[i - ignoredDims] = getStride(i);
+    }
+  }
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int NewDim>
+__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
+  // Can only create tensors of lesser dimension
+  thc_static_assert(NewDim < Dim);
+  // We can't downcast non-contiguous tensors, since it leaves
+  // garbage data in the tensor. The tensor needs to be contiguous
+  // in all of the dimensions we are collapsing (no padding in
+  // them).
+  bool cont = isContiguousRange(NewDim, Dim);
+#ifdef __CUDA_ARCH__
+  // Device code
+  assert(cont);
+#else
+  // Host code
+  if (!cont) {
+    THError("Can only downcast contiguous tensors");
+  }
+#endif
+  IndexT newSize[NewDim];
+  IndexT newStride[NewDim];
+  IndexT collapsedSize = 1;
+  for (int i = Dim - 1; i >= 0; --i) {
+    if (i >= NewDim) {
+      // Collapse these dimensions
+      collapsedSize *= getSize(i);
+    } else {
+      // Non-collapsed dimensions
+      if (i == NewDim - 1) {
+        // This is the first non-collapsed dimension
+        newSize[i] = collapsedSize * getSize(i);
+        newStride[i] = getStride(Dim - 1);
+      } else {
+        // Subsequent non-collapsed dimensions
+        newSize[i] = getSize(i);
+        newStride[i] = getStride(i);
+      }
+    }
+  }
+  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int SubDim>
+__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
+  thc_static_assert(SubDim >= 1 && SubDim < Dim);
+  IndexT viewSizes[SubDim];
+  IndexT viewStrides[SubDim];
+  for (int i = 0; i < SubDim; ++i) {
+    viewSizes[i] = size_[Dim - SubDim + i];
+    viewStrides[i] = stride_[Dim - SubDim + i];
+  }
+  return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
+    at, viewSizes, viewStrides);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+template <int SubDim>
+__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
+  return view<SubDim>(data_);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+void
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
+#ifdef __CUDA_ARCH__
+  assert(isContiguous());
+#else
+  if (!isContiguous()) {
+    THError("fillAsync only works on contiguous data");
+  }
+#endif
+  cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
+}
--- a/encoding/kernel/include/THCDeviceTensor.cuh
+++ b/encoding/kernel/include/THCDeviceTensor.cuh
+#ifndef THC_DEVICE_TENSOR_INC
+#define THC_DEVICE_TENSOR_INC
+#include <cuda.h>
+#include <cuda_runtime.h>
+// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
+template <bool>
+struct THCStaticAssert;
+template <>
+struct THCStaticAssert<true> {
+};
+#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
+/// Our tensor type
+template <typename T,
+          int Dim,
+          typename IndexT,
+          template <typename U> class PtrTraits>
+class THCDeviceTensor;
+/// Type of a subspace of a tensor
+namespace detail {
+template <typename TensorType,
+          int SubDim,
+          template <typename U> class PtrTraits>
+class THCDeviceSubTensor;
+}
+template <typename T>
+struct RestrictPtrTraits {
+  typedef T* __restrict__ PtrType;
+};
+template <typename T>
+struct DefaultPtrTraits {
+  typedef T* PtrType;
+};
+/**
+   Templated multi-dimensional array that supports strided access of
+   elements. Main access is through `operator[]`; e.g.,
+   `tensor[x][y][z]`.
+- `T` is the contained type (e.g., `float`)
+- `Dim` is the tensor rank
+- `IndexT` is the integer type used for size/stride arrays, and for
+- all indexing math. Default is `int`, but for large tensors, `int64_t`
+- can be used instead.
+- `PtrTraits` are traits applied to our data pointer (T*). By default,
+- this is just T*, but RestrictPtrTraits can be used to apply T*
+- __restrict__ for alias-free analysis.
+*/
+template <typename T,
+          int Dim,
+          typename IndexT = int,
+          template <typename U> class PtrTraits = DefaultPtrTraits>
+class THCDeviceTensor {
+ public:
+  enum { NumDim = Dim };
+  typedef T DataType;
+  typedef IndexT IndexType;
+  typedef typename PtrTraits<T>::PtrType DataPtrType;
+  typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;
+  /// Default constructor
+  __host__ __device__ THCDeviceTensor();
+  /// Constructor that calculates strides with no padding
+  __host__ __device__ THCDeviceTensor(DataPtrType data,
+#ifdef _MSC_VER
+                                      const IndexT (&sizes)[Dim]);
+#else
+                                      const IndexT sizes[Dim]);
+#endif
+  /// Constructor that takes arbitrary size/stride arrays
+  __host__ __device__ THCDeviceTensor(DataPtrType data,
+#ifdef _MSC_VER
+                                      const IndexT (&sizes)[Dim],
+                                      const IndexT (&strides)[Dim]);
+#else
+                                      const IndexT sizes[Dim],
+                                      const IndexT strides[Dim]);
+#endif
+  /// Returns true if the two tensors are of the same dimensionality,
+  /// size and stride.
+  template <int OtherDim>
+  __host__ __device__ bool
+  isSameSizeAndStride(
+    const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;
+  /// Cast to a tensor of a different type of the same size and stride
+  template <typename U>
+  __host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();
+  /// Const version of `cast`
+  template <typename U>
+  __host__ __device__
+  const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;
+  /// Returns a raw pointer to the start of our data.
+  __host__ __device__ __forceinline__ DataPtrType data() {
+    return data_;
+  }
+  /// Returns a raw pointer to the start of our data (const).
+  __host__ __device__ __forceinline__
+  const DataPtrType data() const {
+    return data_;
+  }
+  /// Cast to a different datatype
+  template <typename U>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<U>::PtrType dataAs() {
+    return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
+  }
+  /// Cast to a different datatype
+  template <typename U>
+  __host__ __device__ __forceinline__
+  const typename PtrTraits<const U>::PtrType dataAs() const {
+    return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
+  }
+  /// Returns a read/write view of a portion of our tensor.
+  __host__ __device__ __forceinline__
+  detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
+    operator[](IndexT);
+  /// Returns a read/write view of a portion of our tensor (const).
+  __host__ __device__ __forceinline__
+  const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
+    operator[](IndexT) const;
+  /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
+  /// checking.
+  __host__ __device__ __forceinline__ int getSize(int i) const {
+    return size_[i];
+  }
+  /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
+  /// checking.
+  __host__ __device__ __forceinline__ int getStride(int i) const {
+    return stride_[i];
+  }
+  /// Returns the total number of elements contained within our data
+  /// (product of `getSize(i)`)
+  __host__ __device__ ptrdiff_t numElements() const;
+  /// Returns the size array.
+  __host__ __device__ __forceinline__ const IndexT* sizes() const {
+    return size_;
+  }
+  /// Returns the stride array.
+  __host__ __device__ __forceinline__ const IndexT* strides() const {
+    return stride_;
+  }
+  /// Returns true if there is no padding within the tensor and no
+  /// re-ordering of the dimensions.
+  /// ~~~
+  /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0
+  /// ~~~
+  __host__ __device__ bool isContiguous() const;
+  /// Returns whether a given dimension has only increasing stride
+  /// from the previous dimension. A tensor that was permuted by
+  /// exchanging size and stride only will fail this check.
+  /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
+  __host__ __device__ bool isConsistentlySized(int i) const;
+  // Returns whether at each dimension `stride <= size`.
+  // If this is not the case then iterating once over the size space will
+  // touch the same memory locations multiple times.
+  __host__ __device__ bool isConsistentlySized() const;
+  /// Returns true if the given dimension range [first, last) has no padding.
+  __host__ __device__ bool isContiguousRange(int first, int last) const;
+  /// Returns a tensor of the same dimension after transposing the two
+  /// dimensions given. Does not actually move elements; transposition
+  /// is made by permuting the size/stride arrays.
+  /// If the dimensions are not valid, asserts.
+  __host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+  transpose(int dim1, int dim2) const;
+  /// Upcast a tensor of dimension `D` to some tensor of dimension
+  /// D' > D by padding the leading dimensions by 1
+  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
+  template <int NewDim>
+  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  upcastOuter();
+  /// Upcast a tensor of dimension `D` to some tensor of dimension
+  /// D' > D by padding the lowest/most varying dimensions by 1
+  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
+  template <int NewDim>
+  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  upcastInner();
+  /// Downcast a tensor of dimension `D` to some tensor of dimension
+  /// D' < D by collapsing the leading dimensions. asserts if there is
+  /// padding on the leading dimensions.
+  template <int NewDim>
+  __host__ __device__
+  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();
+  /// Downcast a tensor of dimension `D` to some tensor of dimension
+  /// D' < D by collapsing the leading dimensions. asserts if there is
+  /// padding on the leading dimensions.
+  template <int NewDim>
+  __host__ __device__
+  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();
+  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
+  /// of this tensor, starting at `at`.
+  template <int SubDim>
+  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+  view(DataPtrType at);
+  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
+  /// of this tensor, starting where our data begins
+  template <int SubDim>
+  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
+  view();
+  /// Zeroes out the tensor asynchronously. Asserts if the contents
+  /// in question are not contiguous.
+  void zero(cudaStream_t stream = 0);
+ private:
+  /// Raw pointer to where the tensor data begins
+  DataPtrType data_;
+  /// Array of strides (in sizeof(T) terms) per each dimension
+  IndexT stride_[Dim];
+  /// Size per each dimension
+  IndexT size_[Dim];
+};
+namespace detail {
+/// Specialization for a view of a single value (0-dimensional)
+template <typename TensorType, template <typename U> class PtrTraits>
+class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
+ public:
+  __host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
+  operator=(typename TensorType::DataType val) {
+    *data_ = val;
+    return *this;
+  }
+  // operator T&
+  __host__ __device__ operator typename TensorType::DataType&() {
+    return *data_;
+  }
+  // const operator T& returning const T&
+  __host__ __device__ operator const typename TensorType::DataType&() const {
+    return *data_;
+  }
+  // operator& returning T*
+  __host__ __device__ typename TensorType::DataType* operator&() {
+    return data_;
+  }
+  // const operator& returning const T*
+  __host__ __device__ const typename TensorType::DataType* operator&() const {
+    return data_;
+  }
+  /// Returns a raw accessor to our slice.
+  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
+    return data_;
+  }
+  /// Returns a raw accessor to our slice (const).
+  __host__ __device__ __forceinline__
+  const typename TensorType::DataPtrType data() const {
+    return data_;
+  }
+  /// Cast to a different datatype.
+  template <typename T>
+  __host__ __device__ T& as() {
+    return *dataAs<T>();
+  }
+  /// Cast to a different datatype (const).
+  template <typename T>
+  __host__ __device__ const T& as() const {
+    return *dataAs<T>();
+  }
+  /// Cast to a different datatype
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<T>::PtrType dataAs() {
+    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
+  }
+  /// Cast to a different datatype (const)
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<const T>::PtrType dataAs() const {
+    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
+  }
+  /// Use the texture cache for reads
+  __device__ __forceinline__ typename TensorType::DataType ldg() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(data_);
+#else
+    return *data_;
+#endif
+  }
+  /// Use the texture cache for reads; cast as a particular type
+  template <typename T>
+  __device__ __forceinline__ T ldgAs() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(dataAs<T>());
+#else
+    return as<T>();
+#endif
+  }
+  private:
+  /// One dimension greater can create us
+  friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;
+  /// Our parent tensor can create us
+  friend class THCDeviceTensor<typename TensorType::DataType,
+                               1,
+                               typename TensorType::IndexType,
+                               PtrTraits>;
+  __host__ __device__ __forceinline__ THCDeviceSubTensor(
+    TensorType& t,
+    typename TensorType::DataPtrType data)
+      : tensor_(t),
+        data_(data) {
+  }
+  /// The tensor we're referencing
+  TensorType& tensor_;
+  /// Where our value is located
+  typename TensorType::DataPtrType const data_;
+};
+/// A `SubDim`-rank slice of a parent THCDeviceTensor
+template <typename TensorType,
+          int SubDim,
+          template <typename U> class PtrTraits>
+class THCDeviceSubTensor {
+ public:
+  /// Returns a view of the data located at our offset (the dimension
+  /// `SubDim` - 1 tensor).
+  __host__ __device__ __forceinline__
+  THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
+    operator[](typename TensorType::IndexType index) {
+    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
+      tensor_,
+      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
+  }
+  /// Returns a view of the data located at our offset (the dimension
+  /// `SubDim` - 1 tensor) (const).
+  __host__ __device__ __forceinline__
+  const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
+    operator[](typename TensorType::IndexType index) const {
+    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
+      tensor_,
+      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
+  }
+  // operator& returning T*
+  __host__ __device__ typename TensorType::DataType* operator&() {
+    return data_;
+  }
+  // const operator& returning const T*
+  __host__ __device__ const typename TensorType::DataType* operator&() const {
+    return data_;
+  }
+  /// Returns a raw accessor to our slice.
+  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
+    return data_;
+  }
+  /// Returns a raw accessor to our slice (const).
+  __host__ __device__ __forceinline__
+  const typename TensorType::DataPtrType data() const {
+    return data_;
+  }
+  /// Cast to a different datatype.
+  template <typename T>
+  __host__ __device__ T& as() {
+    return *dataAs<T>();
+  }
+  /// Cast to a different datatype (const).
+  template <typename T>
+  __host__ __device__ const T& as() const {
+    return *dataAs<T>();
+  }
+  /// Cast to a different datatype
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<T>::PtrType dataAs() {
+    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
+  }
+  /// Cast to a different datatype (const)
+  template <typename T>
+  __host__ __device__ __forceinline__
+  typename PtrTraits<const T>::PtrType dataAs() const {
+    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
+  }
+  /// Use the texture cache for reads
+  __device__ __forceinline__ typename TensorType::DataType ldg() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(data_);
+#else
+    return *data_;
+#endif
+  }
+  /// Use the texture cache for reads; cast as a particular type
+  template <typename T>
+  __device__ __forceinline__ T ldgAs() const {
+#if __CUDA_ARCH__ >= 350
+    return __ldg(dataAs<T>());
+#else
+    return as<T>();
+#endif
+  }
+  /// Returns a tensor that is a view of the SubDim-dimensional slice
+  /// of this tensor, starting where our data begins
+  THCDeviceTensor<typename TensorType::DataType,
+               SubDim,
+               typename TensorType::IndexType,
+               PtrTraits> view() {
+    return tensor_.template view<SubDim>(data_);
+  }
+ private:
+  /// One dimension greater can create us
+  friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;
+  /// Our parent tensor can create us
+  friend class
+  THCDeviceTensor<typename TensorType::DataType,
+               TensorType::NumDim,
+               typename TensorType::IndexType,
+               PtrTraits>;
+  __host__ __device__ __forceinline__ THCDeviceSubTensor(
+    TensorType& t,
+    typename TensorType::DataPtrType data)
+      : tensor_(t),
+        data_(data) {
+  }
+  /// The tensor we're referencing
+  TensorType& tensor_;
+  /// The start of our sub-region
+  typename TensorType::DataPtrType const data_;
+};
+} // namespace detail
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ __forceinline__
+detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
+                        Dim - 1, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
+  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
+    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
+      *this, data_)[index]);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__ __forceinline__
+const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
+                              Dim - 1, PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
+  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
+    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
+      const_cast<TensorType&>(*this), data_)[index]);
+}
+#include "THCDeviceTensor-inl.cuh"
+#endif // THC_DEVICE_TENSOR_INC
--- a/encoding/kernel/include/THCDeviceTensorUtils-inl.cuh
+++ b/encoding/kernel/include/THCDeviceTensorUtils-inl.cuh
+namespace detail {
+// Add a layer of SFINAE to support static_assert
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct UpcastTHCRoot {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t);
+};
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct UpcastTHC :
+      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
+};
+// Never instantiated SFINAE purposes only
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
+      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
+};
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
+      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t) {
+    thc_static_assert(NewDim > Dim);
+    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
+      template upcastOuter<NewDim>();
+  }
+};
+// Add a layer of SFINAE to support static_assert
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct DowncastTHCRoot {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t);
+};
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim, bool B>
+struct DowncastTHC :
+      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
+};
+// Never instantiated SFINAE purposes only
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
+      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
+};
+template <typename T, int Dim, typename IndexT,
+          template <typename U> class PtrTraits,
+          int NewDim>
+struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
+      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
+  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+  make(THCState* state, THCudaTensor* t) {
+    thc_static_assert(NewDim < Dim);
+    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
+      template downcastOuter<NewDim>();
+  }
+};
+} // namespace detail
+#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i)                              \
+  case i:                                                               \
+  if (NewDim > i) {                                                     \
+    return detail::UpcastTHC<T, i, IndexT,                              \
+                             PtrTraits, NewDim, (NewDim > i)>::         \
+      make(state, t);                                                   \
+  } else if (NewDim == i) {                                             \
+    return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t);      \
+  } else {                                                              \
+    return detail::DowncastTHC<T, i, IndexT,                            \
+                               PtrTraits, NewDim, (NewDim < i)>::       \
+      make(state, t);                                                   \
+  }                                                                     \
+  /* break; */
+template <typename T, int NewDim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t) {
+  switch (THCudaTensor_nDimension(state, t)) {
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
+    SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
+    default:
+      ;
+  }
+  // Not implemented
+  THError("THCDeviceTensor dimension size not supported");
+  return NULL; /* never enters this piece, appeasing compiler warnings */
+}
+#undef SWITCH_UNROLL_CUDA_CAST_FACTORY
--- a/encoding/kernel/include/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cu
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
+#else
+/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
+/// error if the dimensionality does not match exactly.
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t);
+template <typename T, int Dim, typename IndexT>
+THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
+}
+template <typename T, int Dim>
+THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  if (Dim != THCTensor_(nDimension)(state, t)) {
+    THError("THCudaTensor dimension mismatch");
+  }
+  // Determine the maximum offset into the tensor achievable; `IndexT`
+  // must be smaller than this type in order to use it.
+  ptrdiff_t maxOffset = 0;
+  IndexT sizes[Dim];
+  IndexT strides[Dim];
+  for (int i = 0; i < Dim; ++i) {
+    int64_t size = THCTensor_(size)(state, t, i);
+    int64_t stride = THCTensor_(stride)(state, t, i);
+    maxOffset += (size - 1) * stride;
+    sizes[i] = (IndexT) size;
+    strides[i] = (IndexT) stride;
+  }
+  if (maxOffset > std::numeric_limits<IndexT>::max()) {
+    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
+  }
+  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
+    THCTensor_(data)(state, t), sizes, strides);
+}
+#endif
--- a/encoding/kernel/include/THCDeviceTensorUtils.cuh
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cuh
+#ifndef THC_DEVICE_TENSOR_UTILS_INC
+#define THC_DEVICE_TENSOR_UTILS_INC
+#include "THCDeviceTensor.cuh"
+#include "THCTensor.h"
+#include <limits>
+/// Constructs a DeviceTensor initialized from a THCudaTensor by
+/// upcasting or downcasting the tensor to that of a different
+/// dimension.
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t);
+template <typename T, int Dim, typename IndexT>
+THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t) {
+  return toDeviceTensorCast<T, Dim, IndexT, DefaultPtrTraits>(state, t);
+}
+template <typename T, int Dim>
+THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
+toDeviceTensorCast(THCState* state, THCudaTensor* t) {
+  return toDeviceTensorCast<T, Dim, int, DefaultPtrTraits>(state, t);
+}
+#include "generic/THCDeviceTensorUtils.cu"
+#include "THCGenerateAllTypes.h"
+#include "THCDeviceTensorUtils-inl.cuh"
+#endif // THC_DEVICE_TENSOR_UTILS_INC
--- a/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
+#else
+/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
+/// error if the dimensionality does not match exactly.
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t);
+template <typename T, int Dim, typename IndexT>
+THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
+}
+template <typename T, int Dim>
+THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
+}
+template <typename T, int Dim,
+          typename IndexT, template <typename U> class PtrTraits>
+THCDeviceTensor<T, Dim, IndexT, PtrTraits>
+toDeviceTensor(THCState* state, THCTensor* t) {
+  if (Dim != THCTensor_(nDimension)(state, t)) {
+    THError("THCudaTensor dimension mismatch");
+  }
+  // Determine the maximum offset into the tensor achievable; `IndexT`
+  // must be smaller than this type in order to use it.
+  ptrdiff_t maxOffset = 0;
+  IndexT sizes[Dim];
+  IndexT strides[Dim];
+  for (int i = 0; i < Dim; ++i) {
+    int64_t size = THCTensor_(size)(state, t, i);
+    int64_t stride = THCTensor_(stride)(state, t, i);
+    maxOffset += (size - 1) * stride;
+    sizes[i] = (IndexT) size;
+    strides[i] = (IndexT) stride;
+  }
+  if (maxOffset > std::numeric_limits<IndexT>::max()) {
+    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
+  }
+  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
+    THCTensor_(data)(state, t), sizes, strides);
+}
+#endif