"vscode:/vscode.git/clone" did not exist on "aee4f523cfd92f844208118e42dcc6bfeb271d08"
Unverified commit 07f25381, authored by Hang Zhang, committed by GitHub
parents cebf1341 70fdeb79
"""Dilated ResNet and DenseNet""" """Dilated ResNet and DenseNet"""
from .resnet import * from .resnet import *
from .densenet import *
"""Dilated DenseNet"""
from collections import OrderedDict
import torch
import torch.utils.model_zoo as model_zoo
from .. import nn
from .. import functions as F
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
def densenet121(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet121']))
return model
def densenet169(pretrained=False, **kwargs):
r"""Densenet-169 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet169']))
return model
def densenet201(pretrained=False, **kwargs):
r"""Densenet-201 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet201']))
return model
def densenet161(pretrained=False, **kwargs):
r"""Densenet-161 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet161']))
return model
class _DenseLayer(nn.Sequential):
# pylint: disable=expression-not-assigned
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
super(_DenseLayer, self).__init__()
self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu.1', nn.ReLU(inplace=True)),
self.add_module('conv.1', nn.Conv2d(
num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu.2', nn.ReLU(inplace=True)),
self.add_module('conv.2', nn.Conv2d(
bn_size * growth_rate, growth_rate, kernel_size=3, stride=1,
padding=dilation, dilation=dilation, bias=False)),
self.drop_rate = drop_rate
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, dilation=1):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate, dilation=dilation)
self.add_module('denselayer%d' % (i + 1), layer)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features, stride, dilation=1):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride,
dilation=dilation))
class DenseNet(nn.Module):
r"""Dilated DenseNet.
To dilate the transition layers of DenseNet correctly, we implement :class:`encoding.nn.DilatedAvgPool2d`.
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for the number of bottleneck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
Reference:
Huang, Gao, et al. "Densely Connected Convolutional Networks" *CVPR 2017*
"""
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
super(DenseNet, self).__init__()
# First convolution
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
strides = [1, 2, 1, 1]
dilations = [1, 1, 2, 4]
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate,
dilation=dilations[i])
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, stride=strides[i+1], dilation=dilations[i])
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', nn.BatchNorm2d(num_features))
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
def forward(self, x):
features = self.features(x)
out = F.relu(features, inplace=True)
# out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
# out = self.classifier(out)
return out
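Because the third and fourth dense blocks use dilations 2 and 4 while only the first transition layer downsamples (strides [1, 2, 1, 1]), the features returned by forward stay at 1/8 of the input resolution. A quick sanity-check sketch, assuming the compiled CUDA extension is available and that this module is importable as encoding.dilated (both assumptions, not confirmed by this diff):

```python
import torch
from encoding.dilated import densenet121  # assumed import path for this file

# DilatedAvgPool2d runs through the CUDA kernels, so this check needs a GPU build.
model = densenet121(pretrained=False).cuda().eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224).cuda())
print(feats.shape)  # expected torch.Size([1, 1024, 28, 28]), i.e. stride 8 for a 224x224 input
```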
@@ -26,15 +26,15 @@ class BasicBlock(nn.Module):
     """ResNet BasicBlock
     """
     expansion = 1
-    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
+    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1,
                  norm_layer=None):
         super(BasicBlock, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                                padding=dilation, dilation=dilation, bias=False)
         self.bn1 = norm_layer(planes)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
-                               padding=first_dilation, dilation=first_dilation, bias=False)
+                               padding=previous_dilation, dilation=previous_dilation, bias=False)
         self.bn2 = norm_layer(planes)
         self.downsample = downsample
         self.stride = stride
@@ -64,7 +64,7 @@ class Bottleneck(nn.Module):
     # pylint: disable=unused-argument
     expansion = 4
     def __init__(self, inplanes, planes, stride=1, dilation=1,
-                 downsample=None, first_dilation=1, norm_layer=None):
+                 downsample=None, previous_dilation=1, norm_layer=None):
         super(Bottleneck, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
         self.bn1 = norm_layer(planes)
@@ -75,7 +75,7 @@ class Bottleneck(nn.Module):
         self.conv3 = nn.Conv2d(
             planes, planes * 4, kernel_size=1, bias=False)
         self.bn3 = norm_layer(planes * 4)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.downsample = downsample
         self.dilation = dilation
         self.stride = stride
@@ -113,6 +113,21 @@ class Bottleneck(nn.Module):
 class ResNet(nn.Module):
     """Dilated Pre-trained ResNet Model, which produces stride-8 feature maps at conv5.
+    Parameters
+    ----------
+    block : Block
+        Class for the residual block. Options are BasicBlockV1, BottleneckV1.
+    layers : list of int
+        Numbers of layers in each block.
+    classes : int, default 1000
+        Number of classification classes.
+    dilated : bool, default False
+        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
+        typically used in Semantic Segmentation.
+    norm_layer : object
+        Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`;
+        e.g. for Synchronized Cross-GPU Batch Normalization).
     Reference:
         - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
@@ -120,18 +135,26 @@ class ResNet(nn.Module):
         - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
     """
     # pylint: disable=unused-variable
-    def __init__(self, block, layers, num_classes=1000, norm_layer=None):
+    def __init__(self, block, layers, num_classes=1000, dilated=True, norm_layer=nn.BatchNorm2d):
         self.inplanes = 64
         super(ResNet, self).__init__()
         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=False)
         self.bn1 = norm_layer(64)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
         self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
         self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
-        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
-        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
+        if dilated:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
+                                           dilation=2, norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
+                                           dilation=4, norm_layer=norm_layer)
+        else:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
+                                           norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
+                                           norm_layer=norm_layer)
         self.avgpool = nn.AvgPool2d(7)
         self.fc = nn.Linear(512 * block.expansion, num_classes)
@@ -155,16 +178,16 @@ class ResNet(nn.Module):
         layers = []
         if dilation == 1 or dilation == 2:
             layers.append(block(self.inplanes, planes, stride, dilation=1,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         elif dilation == 4:
             layers.append(block(self.inplanes, planes, stride, dilation=2,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         else:
             raise RuntimeError("=> unknown dilation size: {}".format(dilation))
         self.inplanes = planes * block.expansion
         for i in range(1, blocks):
-            layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
+            layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation,
                                 norm_layer=norm_layer))
         return nn.Sequential(*layers)
...
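The `dilated` flag above is what turns the stride-32 classification backbone into a stride-8 backbone: `layer3` and `layer4` keep `stride=1` and use dilations 2 and 4 instead. A minimal sketch of that effect, assuming the usual resnet50 layer configuration [3, 4, 6, 3] and that `ResNet`/`Bottleneck` can be imported from this module (the import path below is an assumption):

```python
import torch
import torch.nn as nn
from encoding.dilated.resnet import ResNet, Bottleneck  # assumed import path

# Build the backbone with the dilation strategy enabled.
net = ResNet(Bottleneck, [3, 4, 6, 3], dilated=True, norm_layer=nn.BatchNorm2d).eval()
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    x = net.maxpool(net.relu(net.bn1(net.conv1(x))))  # 1/4 resolution
    x = net.layer2(net.layer1(x))                     # 1/8 resolution
    x = net.layer4(net.layer3(x))                     # stays at 1/8 when dilated=True
print(x.shape)  # expected torch.Size([1, 2048, 28, 28]); with dilated=False it would be 7x7
```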
"""Encoding Autograd Fuctions""" """Encoding Autograd Fuctions"""
from .encoding import * from .encoding import *
from .syncbn import * from .syncbn import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Customized Functions"""
import math
import torch
from torch.autograd import Function, Variable
from torch.nn.modules.utils import _pair
from .._ext import encoding_lib
__all__ = ['dilatedavgpool2d']
class _dilatedavgpool2d(Function):
@staticmethod
def forward(ctx, input, kernel_size, stride, padding,
dilation=1):
ctx.kH, ctx.kW = _pair(kernel_size)
ctx.dH, ctx.dW = _pair(stride if stride is not None else kernel_size)
ctx.padH, ctx.padW = _pair(padding)
ctx.dilationH, ctx.dilationW = _pair(dilation)
b, c, h, w = input.size()
if ctx.dH == 1 and ctx.dW == 1:
# keep the size for dilated avgpool
ow, oh = w, h
else:
ow = math.floor(float(w-ctx.kW+2*ctx.padW)/float(ctx.dW)) +1
oh = math.floor(float(h-ctx.kH+2*ctx.padH)/float(ctx.dH)) +1
with torch.cuda.device_of(input):
output = input.new(b, c, oh, ow)
ctx.save_for_backward(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return output
@staticmethod
def backward(ctx, gradOutput):
input, = ctx.saved_variables
with torch.cuda.device_of(input):
gradInput = Variable(input.data.new().resize_as_(input.data))
if isinstance(input.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
elif isinstance(input.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, None, None, None, None
def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
dilation=1):
"""Dilated Average Pool 2d, for dilation of DenseNet.
Reference:
Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang,
Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation." CVPR 2018
Applies 2D average-pooling operation in kh x kw regions by step size
dh x dw steps. The number of output features is equal to the number of
input planes.
See :class:`~encoding.nn.DilatedAvgPool2d` for details and output shape.
Args:
input: input tensor (minibatch x in_channels x iH x iW)
kernel_size: size of the pooling region, a single number or a
tuple (kh x kw)
stride: stride of the pooling operation, a single number or a
tuple (sh x sw). Default is equal to kernel size
padding: implicit zero padding on the input, a single number or
a tuple (padh x padw), Default: 0
dilation: the dilation parameter similar to Conv2d
"""
return _dilatedavgpool2d.apply(input, kernel_size, stride, padding, dilation)
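For intuition, here is a small CPU reference (a sketch, not part of the package) that mirrors the arithmetic of the forward kernel: each output pixel averages the valid input samples on a dilated grid, and with stride=1 the spatial size is preserved, which is exactly what the dilated DenseNet transition layers need:

```python
import torch

def dilated_avg_pool2d_ref(x, kernel_size=2, stride=1, padding=0, dilation=1):
    """CPU sketch of the dilated average pooling semantics (square kernels only)."""
    kH = kW = kernel_size
    dH = dW = stride
    b, c, h, w = x.shape
    if dH == 1 and dW == 1:
        oh, ow = h, w                                   # stride 1 keeps the size
    else:
        oh = (h - kH + 2 * padding) // dH + 1
        ow = (w - kW + 2 * padding) // dW + 1
    y = x.new_zeros(b, c, oh, ow)
    for i in range(oh):
        for j in range(ow):
            hs, ws = i * dH - padding, j * dW - padding
            he = min(hs + kH * dilation, h)
            we = min(ws + kW * dilation, w)
            hs, ws = max(hs, 0), max(ws, 0)
            patch = x[:, :, hs:he:dilation, ws:we:dilation]
            y[:, :, i, j] = patch.mean(dim=(-2, -1))    # average only the sampled points
    return y

x = torch.randn(2, 4, 14, 14)
print(dilated_avg_pool2d_ref(x, kernel_size=2, stride=1, dilation=2).shape)  # (2, 4, 14, 14)
```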
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,7 +10,7 @@
 """Functions for Encoding Layer"""
 import torch
 from torch.autograd import Function, Variable
-from .._ext import encoding_lib
+from .. import lib
 __all__ = ['aggregate', 'scaledL2']
@@ -20,47 +19,27 @@ class _aggregate(Function):
     def forward(ctx, A, X, C):
         # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
         ctx.save_for_backward(A, X, C)
-        B, _, K = A.size()
-        D = X.size(2)
-        with torch.cuda.device_of(A):
-            E = A.new(B, K, D)
-        if isinstance(A, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Float_aggregate_forward(E, A, X, C)
-        elif isinstance(A, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Double_aggregate_forward(E, A, X, C)
+        if A.is_cuda:
+            E = lib.gpu.aggregate_forward(A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return E
     @staticmethod
     def backward(ctx, gradE):
         A, X, C = ctx.saved_variables
-        with torch.cuda.device_of(A):
-            gradA = Variable(A.data.new().resize_as_(A.data))
-            gradX = Variable(A.data.new().resize_as_(X.data))
-            gradC = Variable(A.data.new().resize_as_(C.data))
-        if isinstance(A.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Float_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
-        elif isinstance(A.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Double_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
+        if A.is_cuda:
+            gradA, gradX, gradC = lib.gpu.aggregate_backward(gradE, A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradX.data.copy_(torch.bmm(A, gradE).data)
-        gradC.data.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0).data)
+            raise NotImplemented
         return gradA, gradX, gradC
 def aggregate(A, X, C):
-    r"""
-    Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
+    r""" Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
     to the codewords (:math:`C`) with assignment weights (:math:`A`).
     .. math::
         e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)
     Shape:
@@ -77,53 +56,31 @@ def aggregate(A, X, C):
         >>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
         >>> func = encoding.aggregate()
         >>> E = func(A, X, C)
     """
     return _aggregate.apply(A, X, C)
 class _scaledL2(Function):
     @staticmethod
     def forward(ctx, X, C, S):
-        B, N, _ = X.size()
-        K = C.size(0)
-        with torch.cuda.device_of(X):
-            SL = X.new(B, N, K)
-        if isinstance(X, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
-        elif isinstance(X, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
+        if X.is_cuda:
+            SL = lib.gpu.scaled_l2_forward(X, C, S)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         ctx.save_for_backward(X, C, S, SL)
         return SL
     @staticmethod
     def backward(ctx, gradSL):
         X, C, S, SL = ctx.saved_variables
-        K = C.size(0)
-        with torch.cuda.device_of(X.data):
-            gradX = Variable(X.data.new().resize_as_(X.data))
-            gradC = Variable(X.data.new().resize_as_(C.data))
-            gradS = Variable(X.data.new().resize_as_(S.data))
-        if isinstance(X.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
-        elif isinstance(X.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
+        if X.is_cuda:
+            gradX, gradC, gradS = lib.gpu.scaled_l2_backward(gradSL, X, C, S, SL)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradS.data.copy_((gradSL*(SL/S.view(1, 1, K))).sum(0).sum(0).data)
+            raise NotImplemented
         return gradX, gradC, gradS
 def scaledL2(X, C, S):
-    r"""
-    scaledL2 distance
+    r""" scaledL2 distance
     .. math::
         sl_{ik} = s_k \|x_i-c_k\|^2
@@ -134,6 +91,5 @@ def scaledL2(X, C, S):
         (where :math:`B` is batch, :math:`N` is total number of features,
         :math:`K` is number of codewords, :math:`D` is feature dimensions.)
         - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
     """
     return _scaledL2.apply(X, C, S)
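As a readability aid, both operations admit a short dense-tensor reference in plain PyTorch (a sketch for checking the CUDA path on small inputs, not part of the package). Shapes follow the docstrings: A is (B, N, K), X is (B, N, D), C is (K, D), and S is (K,):

```python
import torch

def aggregate_ref(A, X, C):
    # e_k = sum_i a_ik * (x_i - c_k), computed per batch element
    return torch.einsum('bnk,bnd->bkd', A, X) - torch.einsum('bnk,kd->bkd', A, C)

def scaled_l2_ref(X, C, S):
    # sl_ik = s_k * ||x_i - c_k||^2
    diff = X.unsqueeze(2) - C.view(1, 1, *C.shape)   # (B, N, K, D)
    return S.view(1, 1, -1) * diff.pow(2).sum(-1)

B, N, K, D = 2, 5, 4, 3
A = torch.softmax(torch.randn(B, N, K), dim=-1)
X = torch.randn(B, N, D)
C = torch.randn(K, D)
S = torch.rand(K)
print(aggregate_ref(A, X, C).shape, scaled_l2_ref(X, C, S).shape)  # (2, 4, 3) (2, 5, 4)
```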
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,9 +10,9 @@
 """Synchronized Cross-GPU Batch Normalization functions"""
 import torch
 from torch.autograd import Variable, Function
-from .._ext import encoding_lib
-__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
+from .. import lib
+__all__ = ['sum_square', 'batchnormtrain']
 def sum_square(input):
     r"""Calculate sum of elements and sum of squares for Batch Normalization"""
@@ -24,91 +23,45 @@ class _sum_square(Function):
     @staticmethod
     def forward(ctx, input):
         ctx.save_for_backward(input)
-        C = input.size(1)
-        with torch.cuda.device_of(input):
-            xsum = input.new().resize_(C).zero_()
-            xsquare = input.new().resize_(C).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_sum_square_Forward(
-                    input, xsum, xsquare)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_sum_square_Forward(
-                    input, xsum, xsquare)
+        if input.is_cuda:
+            xsum, xsqusum = lib.gpu.sumsquare_forward(input)
         else:
-            raise RuntimeError('Unimplemented data type!', type(input))
-        return xsum, xsquare
+            raise NotImplemented
+        return xsum, xsqusum
     @staticmethod
     def backward(ctx, gradSum, gradSquare):
         input, = ctx.saved_variables
-        with torch.cuda.device_of(input.data):
-            gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
-        if isinstance(input.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Float_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
-        elif isinstance(input.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Double_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
+        if input.is_cuda:
+            gradInput = lib.gpu.sumsquare_backward(input, gradSum, gradSquare)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return gradInput
-class _batchnorm(Function):
-    def __init__(self, training=False):
-        super(_batchnorm, self).__init__()
-        self.training = training
-    def forward(self, input, gamma, beta, mean, std):
-        self.save_for_backward(input, gamma, beta, mean, std)
-        assert(input.dim() == 3)
-        with torch.cuda.device_of(input):
-            invstd = 1.0 / std
-            output = input.new().resize_as_(input)
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
+class _batchnormtrain(Function):
+    @staticmethod
+    def forward(ctx, input, mean, std, gamma, beta):
+        ctx.save_for_backward(input, mean, std, gamma, beta)
+        if input.is_cuda:
+            output = lib.gpu.batchnorm_forward(input, mean, std, gamma, beta)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return output
-    def backward(self, gradOutput):
-        input, gamma, beta, mean, std = self.saved_tensors
-        invstd = 1.0 / std
-        with torch.cuda.device_of(input):
-            gradInput = gradOutput.new().resize_as_(input).zero_()
-            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
-            gradBeta = gradOutput.new().resize_as_(beta).zero_()
-            gradMean = gradOutput.new().resize_as_(mean).zero_()
-            gradStd = gradOutput.new().resize_as_(std).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
+    @staticmethod
+    def backward(ctx, gradOutput):
+        input, mean, std, gamma, beta = ctx.saved_variables
+        if gradOutput.is_cuda:
+            gradInput, gradMean, gradStd, gradGamma, gradBeta = \
+                lib.gpu.batchnorm_backward(gradOutput, input, mean,
+                                           std, gamma, beta, True)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        return gradInput, gradGamma, gradBeta, gradMean, gradStd
+            raise NotImplemented
+        return gradInput, gradMean, gradStd, gradGamma, gradBeta
-def batchnormtrain(input, gamma, beta, mean, std):
+def batchnormtrain(input, mean, std, gamma, beta):
     r"""Applies Batch Normalization over a 3d input that is seen as a
     mini-batch.
@@ -123,14 +76,4 @@ def batchnormtrain(input, gamma, beta, mean, std):
     - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
     """
-    return _batchnorm(True)(input, gamma, beta, mean, std)
+    return _batchnormtrain.apply(input, mean, std, gamma, beta)
-def batchnormeval(input, gamma, beta, mean, std):
-    r"""Applies Batch Normalization over a 3d input that is seen as a
-    mini-batch.
-    Please see encoding.batchnormtrain_
-    """
-    return _batchnorm(False)(input, gamma, beta, mean, std)
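For reference, batchnormtrain computes the usual affine batch-norm transform with externally supplied per-channel statistics; the statistics are what sum_square lets the caller reduce across GPUs first. A plain-PyTorch sketch of the same arithmetic (an illustration, not the package API):

```python
import torch

def batchnormtrain_ref(x, mean, std, gamma, beta):
    # x: (B, C, L) flattened input; mean/std/gamma/beta: (C,)
    shape = (1, -1, 1)
    return gamma.view(shape) * (x - mean.view(shape)) / std.view(shape) + beta.view(shape)

x = torch.randn(4, 8, 32)
xsum, xsqu = x.sum(dim=(0, 2)), x.pow(2).sum(dim=(0, 2))   # what sum_square returns
n = x.numel() / x.size(1)
mean = xsum / n
std = (xsqu / n - mean.pow(2)).clamp(min=1e-5).sqrt()
y = batchnormtrain_ref(x, mean, std, torch.ones(8), torch.zeros(8))
print(round(y.mean().item(), 3), round(y.std().item(), 3))  # roughly 0 and 1
```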
// Warp size and the maximum number of threads in a block
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 512;
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
for (int i = 0; i != 5; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
__device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/device_tensor.h"
#else
template <int Dim>
THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
if (!t) {
return THCDeviceTensor<real, Dim>();
}
int inDim = THCTensor_(nDimension)(state, t);
return toDeviceTensor<real, Dim>(state, t);
/*
if (inDim == Dim) {
return toDeviceTensor<real, Dim>(state, t);
}
// View in which the last dimensions are collapsed or expanded as needed
THAssert(THCTensor_(isContiguous)(state, t));
int size[Dim];
for (int i = 0; i < Dim || i < inDim; ++i) {
if (i < Dim && i < inDim) {
size[i] = t->size[i];
} else if (i < Dim) {
size[i] = 1;
} else {
size[Dim - 1] *= t->size[i];
}
}
return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
*/
}
struct Encoding_(Float2)
/*
* For reduce-sum calculation of two elements
*/
{
real v1, v2;
__device__ Encoding_(Float2)() {}
__device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
__device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a)
{
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if CUDA_VERSION >= 9000
unsigned int mask = 0xffffffff;
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor_sync(mask, val, 1 << i, WARP_SIZE);
}
#else
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
__shared__ real values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
#endif
return val;
}
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(
Encoding_(Float2) value)
{
value.v1 = Encoding_(rwarpSum)(value.v1);
value.v2 = Encoding_(rwarpSum)(value.v2);
return value;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
Encoding_(AggOp) g(A,X,C);
E[b][k][d] = Encoding_(reduce_agg)(g,b,k,d,N);
}
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregating forward the residuals with assignment weights
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// B, K, D
dim3 blocks(C.getSize(1), C.getSize(0), X.getSize(0));
// N
dim3 threads(getNumThreads(X.getSize(1)));
Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating backward kernel function
* G (dl/dR), L (dl/dE), A
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
Encoding_(AggBackOp) g(GE,X,C);
GA[b][i][k] = Encoding_(reduce_aggback)(g,b,i,k,D);
}
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate backward to assignment weights
* G (dl/dR), L (dl/dE), A
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// B, K, D
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
// N
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GA, GE, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
THCDeviceTensor<real, 3> SL,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
* scaled-L2 forward kernel function
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
Encoding_(L2Op) g(X,C);
SL[b][i][k] = S[k] * Encoding_(reduce_sl2)(g,b,i,k,D);
}
void Encoding_(ScaledL2_Forward)(
THCState *state, THCTensor *SL_, THCTensor *X_,
THCTensor *C_, THCTensor *S_)
/*
* scaled-L2 forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
if (THCTensor_(nDimension)(state, SL_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
(SL, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 3> GX,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
i = blockIdx.y;
K = C.getSize(0);
/* main operation */
Encoding_(L2XBackOp) g(GSL,X,C,S);
GX[b][i][d] = Encoding_(reduce_sl2xback)(g,b,i,d,K);
}
__global__ void Encoding_(ScaledL2C_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 2> GC,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
Encoding_(L2CBackOp) g(GSL,X,C,S);
GC[k][d] = Encoding_(reduce_sl2cback)(g,k,d,B,N);
}
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
if (THCTensor_(nDimension)(state, GSL_) != 3 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GC_) != 2 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GC = devicetensor<2>(state, GC_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(X.getSize(2), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(0)));
Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GSL, GX, X, C, S);
THCudaCheck(cudaGetLastError());
dim3 blocks2(C.getSize(1), C.getSize(0));
dim3 threads2(getNumThreads(X.getSize(1)));
Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads2, 0, stream>>>
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_utils.c"
#else
struct Encoding_(AggOp) {
__device__ Encoding_(AggOp)(THCDeviceTensor<real, 3> a,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: A(a), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> A;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_agg)(
Encoding_(AggOp) op,
int b, int k, int d, int N)
{
real sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(AggBackOp) {
__device__ Encoding_(AggBackOp)(THCDeviceTensor<real, 3> ge,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: GE(ge), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GE;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_aggback)(
Encoding_(AggBackOp) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2Op) {
__device__ Encoding_(L2Op)(THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
real r = X[b][i][d].ldg() - C[k][d].ldg();
return r * r;
}
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_sl2)(
Encoding_(L2Op) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2XBackOp) {
__device__ Encoding_(L2XBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return 2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2xback)(
Encoding_(L2XBackOp) op,
int b, int i, int d, int K)
{
real sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2CBackOp) {
__device__ Encoding_(L2CBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return -2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2cback)(
Encoding_(L2CBackOp) op,
int k, int d, int B, int N)
{
real sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.c"
#else
__global__ void Encoding_(DilatedAvgPool_Forward_kernel) (
THCDeviceTensor<real, 4> X,
THCDeviceTensor<real, 4> Y,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d forward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = Y.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= Y.getSize(3) || h >= Y.getSize(2)) return;
int hstart = h*dH -padH;
int wstart = w*dW -padW;
int hend = min(hstart + kH*dilationH, X.getSize(2));
int wend = min(wstart + kW*dilationW, X.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum = 0;
for (int th=hstart; th < hend; th+=dilationH) {
for (int tw=wstart; tw < wend; tw+=dilationW) {
sum += X[b][c][th][tw];
}
}
Y[b][c][h][w] = sum / pool_size;
}
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, X_, Y_);
if (THCTensor_(nDimension)(state, X_) != 4 ||
THCTensor_(nDimension)(state, Y_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> X = devicetensor<4>(state, X_);
THCDeviceTensor<real, 4> Y = devicetensor<4>(state, Y_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(Y.getSize(3)/16+1, Y.getSize(2)/16+1,
Y.getSize(1)*Y.getSize(0));
Encoding_(DilatedAvgPool_Forward_kernel)<<<blocks, threads, 0, stream>>>
(X, Y, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(DilatedAvgPool_Backward_kernel) (
THCDeviceTensor<real, 4> gradX,
THCDeviceTensor<real, 4> gradY,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d backward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = gradX.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= gradX.getSize(3) || h >= gradX.getSize(2)) return;
int phstart = (h + padH < ((kH-1)*dilationH+1)) ? 0 :
(h + padH - ((kH-1)*dilationH+1))/dH + 1;
int pwstart = (w + padW < ((kW-1)*dilationW+1)) ? 0 :
(w + padW - ((kW-1)*dilationW+1))/dW + 1;
int phend = min((h+padH)/dH+1, gradY.getSize(2));
int pwend = min((w+padW)/dW+1, gradY.getSize(3));
sum = 0;
int hstart, wstart, hend, wend, pool_size;
for (int ph=phstart; ph < phend; ++ph) {
for (int pw=pwstart; pw < pwend; ++pw) {
hstart = ph*dH -padH;
wstart = pw*dW -padW;
hend = min(hstart + kH*dilationH, gradX.getSize(2));
wend = min(wstart + kW*dilationW, gradX.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum += gradY[b][c][ph][pw] / pool_size;
}
}
gradX[b][c][h][w] = sum;
}
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d backward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, gradX_, gradY_);
if (THCTensor_(nDimension)(state, gradX_) != 4 ||
THCTensor_(nDimension)(state, gradY_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> gradX = devicetensor<4>(state, gradX_);
THCDeviceTensor<real, 4> gradY = devicetensor<4>(state, gradY_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(gradX.getSize(3)/16+1, gradX.getSize(2)/16+1,
gradX.getSize(1)*gradX.getSize(0));
Encoding_(DilatedAvgPool_Backward_kernel)<<<blocks, threads, 0, stream>>>
(gradX, gradY, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.h"
#else
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.c"
#else
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
real inp = input[b][c][x].ldg();
output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
invstd[c].ldg() + beta[c].ldg();
}
}
}
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
* batch norm forward function
* assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
gamma_, beta_);
if (THCTensor_(nDimension)(state, output_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1)
THError("BatchNorm2d forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
output, input, mean, invstd, gamma, beta);
THCudaCheck(cudaGetLastError());
}
struct Encoding_(GradOp) {
__device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = gradOutput[batch][plane][n].ldg();
real c = input[batch][plane][n].ldg() - mean;
return Encoding_(Float2)(g, g * c);
}
real mean;
THCDeviceTensor<real, 3> input;
THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, b/c/n) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce)(
Encoding_(GradOp) op,
THCDeviceTensor<real, 3> tensor,
int plane)
{
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
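/* A minimal sketch of the warp-level summation that Encoding_(warpSum) is
 * assumed to perform before the shared-memory stage above: each lane adds the
 * value held by the lane `offset` positions higher via register shuffles, so
 * lane 0 ends up with the warp total. Illustrative only, not the library's
 * actual implementation. */
__device__ inline float warp_sum_sketch(float val) {
	for (int offset = warpSize / 2; offset > 0; offset /= 2) {
		val += __shfl_down_sync(0xffffffff, val, offset); /* pull from lane + offset */
	}
	return val; /* complete in lane 0 of each warp */
}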
__global__ void Encoding_(BatchNorm_Backward_kernel) (
THCDeviceTensor<real, 3> gradoutput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 3> gradinput,
THCDeviceTensor<real, 1> gradgamma,
THCDeviceTensor<real, 1> gradbeta,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta,
THCDeviceTensor<real, 1> gradMean,
THCDeviceTensor<real, 1> gradStd,
int train)
{
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
real dotP = res.v2;
//real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
real gradScale = invstd[c].ldg() * gamma[c].ldg();
if (train && threadIdx.x == 0) {
gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
gradgamma[c] += dotP * invstd[c].ldg();
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
* batch norm backward function
 * assuming the input has already been flattened
*/
{
/* Check the GPU index and tensor dims*/
	THCTensor_(checkGPU)(state, 9, gradoutput_, input_, gradinput_,
		gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_);
if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradinput_) != 3 ||
THCTensor_(nDimension)(state, gradgamma_) != 1 ||
THCTensor_(nDimension)(state, gradbeta_) != 1 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1 ||
THCTensor_(nDimension)(state, gradMean_) != 1 ||
THCTensor_(nDimension)(state, gradStd_) != 1 )
THError("BatchNorm2d backward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradoutput =
devicetensor<3>(state, gradoutput_);
THCDeviceTensor<real, 3> input =
devicetensor<3>(state, input_);
THCDeviceTensor<real, 3> gradinput =
devicetensor<3>(state, gradinput_);
THCDeviceTensor<real, 1> gradgamma =
devicetensor<1>(state, gradgamma_);
THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
gamma, beta, gradMean, gradStd, train);
THCudaCheck(cudaGetLastError());
}
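/* Illustrative single-channel C++ reference (hypothetical helper, not in the
 * original sources) of the gradients produced by BatchNorm_Backward_kernel:
 * gradInput scales gradOutput by gamma*invstd, gradGamma/gradBeta accumulate
 * the channel reductions, and gradMean/gradStd are only filled in training. */
static void batchnorm_backward_reference_channel(
	const float* x, const float* gout, float* gin, /* flattened view of one channel */
	int count, float mean, float invstd, float gamma,
	float* gradGamma, float* gradBeta, float* gradMean, float* gradStd, int train)
{
	float gsum = 0.f, dotP = 0.f;
	for (int i = 0; i < count; ++i) {
		gsum += gout[i];                   /* sum of gradOutput */
		dotP += (x[i] - mean) * gout[i];   /* <x - mean, gradOutput> */
	}
	for (int i = 0; i < count; ++i)
		gin[i] = gout[i] * gamma * invstd; /* gradInput */
	*gradGamma += dotP * invstd;
	*gradBeta  += gsum;
	if (train) {
		*gradMean = -gsum * gamma * invstd;
		*gradStd  = -dotP * gamma * invstd * invstd;
	}
}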
struct Encoding_(SumOp) {
__device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
: input(i){}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = input[batch][plane][n].ldg();
return Encoding_(Float2)(g, g * g);
}
real mean;
THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(Sum_Square_Forward_kernel) (
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> sum,
THCDeviceTensor<real, 1> square)
{
int c = blockIdx.x;
/* main operation */
Encoding_(SumOp) g(input);
Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
real xsum = res.v1;
real xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
if (THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, sum_) != 1 ||
THCTensor_(nDimension)(state, square_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
input, sum, square);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(Sum_Square_Backward_kernel) (
THCDeviceTensor<real, 3> gradInput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> gradSum,
THCDeviceTensor<real, 1> gradSquare)
{
int c = blockIdx.x;
/* main operation */
for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
{
gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
input[batch][c][x];
}
}
}
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput_, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
gradSquare_);
if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradSum_) != 1 ||
THCTensor_(nDimension)(state, gradSquare_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
	THCDeviceTensor<real, 1> gradSquare = devicetensor<1>(state, gradSquare_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradInput, input, gradSum, gradSquare);
THCudaCheck(cudaGetLastError());
}
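/* Illustrative note: the backward kernel above is the chain rule for the two
 * reductions computed in the forward pass, d(sum x)/dx = 1 and
 * d(sum x^2)/dx = 2x, hence gradInput = gradSum + 2 * gradSquare * x
 * for every element x of the input. */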
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
#else
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
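/* A hedged sketch (not from the original sources) of how the entry points
 * above are presumably combined for synchronized batch normalization: the
 * per-channel sum and square-sum from Sum_Square_Forward are reduced across
 * devices elsewhere, then turned into the mean/invstd consumed by
 * BatchNorm_Forward. Function and variable names here are illustrative. */
#include <math.h>
static void moments_from_sums(const float* sum, const float* square,
                              float* mean, float* invstd,
                              int channels, long count, float eps)
{
	for (int c = 0; c < channels; ++c) {
		mean[c] = sum[c] / count;                          /* E[x] */
		float var = square[c] / count - mean[c] * mean[c]; /* E[x^2] - E[x]^2 */
		invstd[c] = 1.f / sqrtf(var + eps);                /* 1 / sqrt(var + eps) */
	}
}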
Copied from the PyTorch lib to make compilation easier for users, in response to many questions and requests.
#include <assert.h>
namespace detail {
template <typename T, int N>
__host__ __device__ void copy(T to[N], T from[N]) {
for (int i = 0; i < N; ++i) {
to[i] = from[i];
}
}
} // namespace detail
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
: data_(NULL) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = 0;
stride_[i] = (IndexT) 1;
}
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
#ifdef _MSC_VER
THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
#else
THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
#endif
: data_(data) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = sizes[i];
}
stride_[Dim - 1] = (IndexT) 1;
for (int i = Dim - 2; i >= 0; --i) {
stride_[i] = stride_[i + 1] * sizes[i + 1];
}
}
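/* Worked example (illustrative): for sizes {2, 3, 4} the loop above produces
 * the row-major strides {12, 4, 1}, i.e. stride[i] = stride[i+1] * size[i+1]
 * with the innermost stride fixed at 1, e.g.
 *   int sizes[3] = {2, 3, 4};
 *   THCDeviceTensor<float, 3> t(devPtr, sizes);  // strides become {12, 4, 1}
 * where devPtr is a hypothetical device pointer. */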
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
#ifdef _MSC_VER
DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
#else
DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
#endif
: data_(data) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = sizes[i];
stride_[i] = strides[i];
}
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) {
return false;
}
for (int i = 0; i < Dim; ++i) {
if (size_[i] != rhs.size_[i]) {
return false;
}
if (stride_[i] != rhs.stride_[i]) {
return false;
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
thc_static_assert(sizeof(U) == sizeof(T));
return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
thc_static_assert(sizeof(U) == sizeof(T));
return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ ptrdiff_t
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
ptrdiff_t size = getSize(0);
for (int i = 1; i < Dim; ++i) {
size *= getSize(i);
}
return size;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
return isContiguousRange(0, Dim);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
return true;
} else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
((getStride(i - 1) / getStride(i)) >= getSize(i))) {
return true;
}
return false;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
for (int i = 0; i < Dim; ++i) {
if (!isConsistentlySized(i)) {
return false;
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousRange(
int first, int last) const {
int64_t prevSize = last < Dim ? getStride(last) * getSize(last) : 1;
for (int i = last - 1; i >= first; --i) {
if (getSize(i) != (IndexT) 1) {
if (getStride(i) == prevSize) {
prevSize *= getSize(i);
} else {
return false;
}
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
int dim2) const {
#ifdef __CUDA_ARCH__
// Device code
assert(dim1 >= 0 && dim1 < Dim);
  assert(dim2 >= 0 && dim2 < Dim);
#else
// Host code
if (dim1 < 0 || dim1 >= Dim) {
THError("dim1 out of bounds");
}
if (dim2 < 0 || dim2 >= Dim) {
THError("dim2 out of bounds");
}
#endif
IndexT newSize[Dim];
IndexT newStride[Dim];
for (int i = 0; i < Dim; ++i) {
newSize[i] = size_[i];
newStride[i] = stride_[i];
}
IndexT tmp = newSize[dim1];
newSize[dim1] = newSize[dim2];
newSize[dim2] = tmp;
tmp = newStride[dim1];
newStride[dim1] = newStride[dim2];
newStride[dim2] = tmp;
return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
}
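/* Worked example (illustrative): transposing a [2][3] tensor with strides
 * {3, 1} yields sizes {3, 2} and strides {1, 3}; no elements are moved, only
 * the size/stride metadata is swapped. */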
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
// Can only create tensors of greater dimension
thc_static_assert(NewDim > Dim);
IndexT newSize[NewDim];
IndexT newStride[NewDim];
int shift = NewDim - Dim;
for (int i = 0; i < NewDim; ++i) {
if (i < shift) {
// These are the extended dimensions
newSize[i] = (IndexT) 1;
newStride[i] = size_[0] * stride_[0];
} else {
// Shift the remaining dimensions
newSize[i] = size_[i - shift];
newStride[i] = stride_[i - shift];
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
// Can only create tensors of greater dimension
thc_static_assert(NewDim > Dim);
IndexT newSize[NewDim];
IndexT newStride[NewDim];
for (int i = 0; i < NewDim; ++i) {
if (i < Dim) {
// Existing dimensions get copied over
newSize[i] = size_[i];
newStride[i] = stride_[i];
} else {
// Extended dimensions
newSize[i] = (IndexT) 1;
newStride[i] = (IndexT) 1;
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
// Can only create tensors of lesser dimension
thc_static_assert(NewDim < Dim);
// We can't downcast non-contiguous tensors, since it leaves
// garbage data in the tensor. The tensor needs to be contiguous
// in all of the dimensions we are collapsing (no padding in
// them).
bool cont = isContiguousRange(0, Dim - NewDim);
#ifdef __CUDA_ARCH__
// Device code
assert(cont);
#else
// Host code
if (!cont) {
THError("Can only downcast contiguous tensors");
}
#endif
IndexT newSize[NewDim];
IndexT newStride[NewDim];
int ignoredDims = Dim - NewDim;
IndexT collapsedSize = 1;
for (int i = 0; i < Dim; ++i) {
if (i < ignoredDims) {
// Collapse these dimensions
collapsedSize *= getSize(i);
} else {
// Non-collapsed dimensions
if (i == ignoredDims) {
// This is the first non-collapsed dimension
newSize[i - ignoredDims] = collapsedSize * getSize(i);
} else {
// Subsequent non-collapsed dimensions
newSize[i - ignoredDims] = getSize(i);
}
newStride[i - ignoredDims] = getStride(i);
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
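/* Worked example (illustrative): downcasting a contiguous [2][3][4] tensor to
 * 2-D collapses the two leading dimensions, giving a [6][4] view with strides
 * {4, 1}; the data pointer is unchanged. */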
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
// Can only create tensors of lesser dimension
thc_static_assert(NewDim < Dim);
// We can't downcast non-contiguous tensors, since it leaves
// garbage data in the tensor. The tensor needs to be contiguous
// in all of the dimensions we are collapsing (no padding in
// them).
bool cont = isContiguousRange(NewDim, Dim);
#ifdef __CUDA_ARCH__
// Device code
assert(cont);
#else
// Host code
if (!cont) {
THError("Can only downcast contiguous tensors");
}
#endif
IndexT newSize[NewDim];
IndexT newStride[NewDim];
IndexT collapsedSize = 1;
for (int i = Dim - 1; i >= 0; --i) {
if (i >= NewDim) {
// Collapse these dimensions
collapsedSize *= getSize(i);
} else {
// Non-collapsed dimensions
if (i == NewDim - 1) {
// This is the first non-collapsed dimension
newSize[i] = collapsedSize * getSize(i);
newStride[i] = getStride(Dim - 1);
} else {
// Subsequent non-collapsed dimensions
newSize[i] = getSize(i);
newStride[i] = getStride(i);
}
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
thc_static_assert(SubDim >= 1 && SubDim < Dim);
IndexT viewSizes[SubDim];
IndexT viewStrides[SubDim];
for (int i = 0; i < SubDim; ++i) {
viewSizes[i] = size_[Dim - SubDim + i];
viewStrides[i] = stride_[Dim - SubDim + i];
}
return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
at, viewSizes, viewStrides);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
return view<SubDim>(data_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
void
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
#ifdef __CUDA_ARCH__
assert(isContiguous());
#else
if (!isContiguous()) {
THError("fillAsync only works on contiguous data");
}
#endif
cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
}
#ifndef THC_DEVICE_TENSOR_INC
#define THC_DEVICE_TENSOR_INC
#include <cuda.h>
#include <cuda_runtime.h>
// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
template <bool>
struct THCStaticAssert;
template <>
struct THCStaticAssert<true> {
};
#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
/// Our tensor type
template <typename T,
int Dim,
typename IndexT,
template <typename U> class PtrTraits>
class THCDeviceTensor;
/// Type of a subspace of a tensor
namespace detail {
template <typename TensorType,
int SubDim,
template <typename U> class PtrTraits>
class THCDeviceSubTensor;
}
template <typename T>
struct RestrictPtrTraits {
typedef T* __restrict__ PtrType;
};
template <typename T>
struct DefaultPtrTraits {
typedef T* PtrType;
};
/**
Templated multi-dimensional array that supports strided access of
elements. Main access is through `operator[]`; e.g.,
`tensor[x][y][z]`.
- `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank
- `IndexT` is the integer type used for size/stride arrays, and for
  all indexing math. Default is `int`, but for large tensors, `int64_t`
  can be used instead.
- `PtrTraits` are traits applied to our data pointer (T*). By default,
  this is just T*, but RestrictPtrTraits can be used to apply T*
  __restrict__ for alias-free analysis.
*/
template <typename T,
int Dim,
typename IndexT = int,
template <typename U> class PtrTraits = DefaultPtrTraits>
class THCDeviceTensor {
public:
enum { NumDim = Dim };
typedef T DataType;
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;
/// Default constructor
__host__ __device__ THCDeviceTensor();
/// Constructor that calculates strides with no padding
__host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
const IndexT (&sizes)[Dim]);
#else
const IndexT sizes[Dim]);
#endif
/// Constructor that takes arbitrary size/stride arrays
__host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
const IndexT (&sizes)[Dim],
const IndexT (&strides)[Dim]);
#else
const IndexT sizes[Dim],
const IndexT strides[Dim]);
#endif
/// Returns true if the two tensors are of the same dimensionality,
/// size and stride.
template <int OtherDim>
__host__ __device__ bool
isSameSizeAndStride(
const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;
/// Cast to a tensor of a different type of the same size and stride
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();
/// Const version of `cast`
template <typename U>
__host__ __device__
const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;
/// Returns a raw pointer to the start of our data.
__host__ __device__ __forceinline__ DataPtrType data() {
return data_;
}
/// Returns a raw pointer to the start of our data (const).
__host__ __device__ __forceinline__
const DataPtrType data() const {
return data_;
}
/// Cast to a different datatype
template <typename U>
__host__ __device__ __forceinline__
typename PtrTraits<U>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
}
/// Cast to a different datatype
template <typename U>
__host__ __device__ __forceinline__
const typename PtrTraits<const U>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
}
/// Returns a read/write view of a portion of our tensor.
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
operator[](IndexT);
/// Returns a read/write view of a portion of our tensor (const).
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
operator[](IndexT) const;
/// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
/// checking.
__host__ __device__ __forceinline__ int getSize(int i) const {
return size_[i];
}
/// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
/// checking.
__host__ __device__ __forceinline__ int getStride(int i) const {
return stride_[i];
}
/// Returns the total number of elements contained within our data
/// (product of `getSize(i)`)
__host__ __device__ ptrdiff_t numElements() const;
/// Returns the size array.
__host__ __device__ __forceinline__ const IndexT* sizes() const {
return size_;
}
/// Returns the stride array.
__host__ __device__ __forceinline__ const IndexT* strides() const {
return stride_;
}
/// Returns true if there is no padding within the tensor and no
/// re-ordering of the dimensions.
/// ~~~
/// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 1
/// ~~~
__host__ __device__ bool isContiguous() const;
/// Returns whether a given dimension has only increasing stride
/// from the previous dimension. A tensor that was permuted by
/// exchanging size and stride only will fail this check.
/// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
__host__ __device__ bool isConsistentlySized(int i) const;
// Returns whether at each dimension `stride <= size`.
// If this is not the case then iterating once over the size space will
// touch the same memory locations multiple times.
__host__ __device__ bool isConsistentlySized() const;
/// Returns true if the given dimension range [first, last) has no padding.
__host__ __device__ bool isContiguousRange(int first, int last) const;
/// Returns a tensor of the same dimension after transposing the two
/// dimensions given. Does not actually move elements; transposition
/// is made by permuting the size/stride arrays.
/// If the dimensions are not valid, asserts.
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
transpose(int dim1, int dim2) const;
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the leading dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
upcastOuter();
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the lowest/most varying dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
upcastInner();
/// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the leading dimensions. asserts if there is
/// padding on the leading dimensions.
template <int NewDim>
__host__ __device__
THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();
/// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the leading dimensions. asserts if there is
/// padding on the leading dimensions.
template <int NewDim>
__host__ __device__
THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting at `at`.
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
view(DataPtrType at);
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting where our data begins
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
view();
/// Zeroes out the tensor asynchronously. Asserts if the contents
/// in question are not contiguous.
void zero(cudaStream_t stream = 0);
private:
/// Raw pointer to where the tensor data begins
DataPtrType data_;
/// Array of strides (in sizeof(T) terms) per each dimension
IndexT stride_[Dim];
/// Size per each dimension
IndexT size_[Dim];
};
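/* A hedged usage sketch (not part of the original header): wrapping a raw
 * device pointer in a 3-d THCDeviceTensor and indexing it from a kernel.
 * The kernel name, buffer and launch configuration are illustrative only.
 *
 *   __global__ void scale_kernel(THCDeviceTensor<float, 3> t, float s) {
 *     int c = blockIdx.x;
 *     for (int b = 0; b < t.getSize(0); ++b)
 *       for (int x = threadIdx.x; x < t.getSize(2); x += blockDim.x)
 *         t[b][c][x] = t[b][c][x] * s;   // element access via operator[]
 *   }
 *
 *   int sizes[3] = {N, C, L};
 *   THCDeviceTensor<float, 3> t(devPtr, sizes);  // row-major strides computed
 *   scale_kernel<<<C, getNumThreads(L)>>>(t, 2.0f);
 */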
namespace detail {
/// Specialization for a view of a single value (0-dimensional)
template <typename TensorType, template <typename U> class PtrTraits>
class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
public:
__host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
operator=(typename TensorType::DataType val) {
*data_ = val;
return *this;
}
// operator T&
__host__ __device__ operator typename TensorType::DataType&() {
return *data_;
}
// const operator T& returning const T&
__host__ __device__ operator const typename TensorType::DataType&() const {
return *data_;
}
// operator& returning T*
__host__ __device__ typename TensorType::DataType* operator&() {
return data_;
}
// const operator& returning const T*
__host__ __device__ const typename TensorType::DataType* operator&() const {
return data_;
}
/// Returns a raw accessor to our slice.
__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
return data_;
}
/// Returns a raw accessor to our slice (const).
__host__ __device__ __forceinline__
const typename TensorType::DataPtrType data() const {
return data_;
}
/// Cast to a different datatype.
template <typename T>
__host__ __device__ T& as() {
return *dataAs<T>();
}
/// Cast to a different datatype (const).
template <typename T>
__host__ __device__ const T& as() const {
return *dataAs<T>();
}
/// Cast to a different datatype
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<T>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
}
/// Cast to a different datatype (const)
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<const T>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
}
/// Use the texture cache for reads
__device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
return __ldg(data_);
#else
return *data_;
#endif
}
/// Use the texture cache for reads; cast as a particular type
template <typename T>
__device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
return __ldg(dataAs<T>());
#else
return as<T>();
#endif
}
private:
/// One dimension greater can create us
friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;
/// Our parent tensor can create us
friend class THCDeviceTensor<typename TensorType::DataType,
1,
typename TensorType::IndexType,
PtrTraits>;
__host__ __device__ __forceinline__ THCDeviceSubTensor(
TensorType& t,
typename TensorType::DataPtrType data)
: tensor_(t),
data_(data) {
}
/// The tensor we're referencing
TensorType& tensor_;
/// Where our value is located
typename TensorType::DataPtrType const data_;
};
/// A `SubDim`-rank slice of a parent THCDeviceTensor
template <typename TensorType,
int SubDim,
template <typename U> class PtrTraits>
class THCDeviceSubTensor {
public:
/// Returns a view of the data located at our offset (the dimension
/// `SubDim` - 1 tensor).
__host__ __device__ __forceinline__
THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) {
return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_,
data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
}
/// Returns a view of the data located at our offset (the dimension
/// `SubDim` - 1 tensor) (const).
__host__ __device__ __forceinline__
const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) const {
return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_,
data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
}
// operator& returning T*
__host__ __device__ typename TensorType::DataType* operator&() {
return data_;
}
// const operator& returning const T*
__host__ __device__ const typename TensorType::DataType* operator&() const {
return data_;
}
/// Returns a raw accessor to our slice.
__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
return data_;
}
/// Returns a raw accessor to our slice (const).
__host__ __device__ __forceinline__
const typename TensorType::DataPtrType data() const {
return data_;
}
/// Cast to a different datatype.
template <typename T>
__host__ __device__ T& as() {
return *dataAs<T>();
}
/// Cast to a different datatype (const).
template <typename T>
__host__ __device__ const T& as() const {
return *dataAs<T>();
}
/// Cast to a different datatype
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<T>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
}
/// Cast to a different datatype (const)
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<const T>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
}
/// Use the texture cache for reads
__device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
return __ldg(data_);
#else
return *data_;
#endif
}
/// Use the texture cache for reads; cast as a particular type
template <typename T>
__device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
return __ldg(dataAs<T>());
#else
return as<T>();
#endif
}
/// Returns a tensor that is a view of the SubDim-dimensional slice
/// of this tensor, starting where our data begins
THCDeviceTensor<typename TensorType::DataType,
SubDim,
typename TensorType::IndexType,
PtrTraits> view() {
return tensor_.template view<SubDim>(data_);
}
private:
/// One dimension greater can create us
friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;
/// Our parent tensor can create us
friend class
THCDeviceTensor<typename TensorType::DataType,
TensorType::NumDim,
typename TensorType::IndexType,
PtrTraits>;
__host__ __device__ __forceinline__ THCDeviceSubTensor(
TensorType& t,
typename TensorType::DataPtrType data)
: tensor_(t),
data_(data) {
}
/// The tensor we're referencing
TensorType& tensor_;
/// The start of our sub-region
typename TensorType::DataPtrType const data_;
};
} // namespace detail
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
*this, data_)[index]);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
const_cast<TensorType&>(*this), data_)[index]);
}
#include "THCDeviceTensor-inl.cuh"
#endif // THC_DEVICE_TENSOR_INC
namespace detail {
// Add a layer of SFINAE to support static_assert
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct UpcastTHCRoot {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t);
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct UpcastTHC :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};
// Never instantiated SFINAE purposes only
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t) {
thc_static_assert(NewDim > Dim);
return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
template upcastOuter<NewDim>();
}
};
// Add a layer of SFINAE to support static_assert
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct DowncastTHCRoot {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t);
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct DowncastTHC :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};
// Never instantiated SFINAE purposes only
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t) {
thc_static_assert(NewDim < Dim);
return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
template downcastOuter<NewDim>();
}
};
} // namespace detail
#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i) \
case i: \
if (NewDim > i) { \
return detail::UpcastTHC<T, i, IndexT, \
PtrTraits, NewDim, (NewDim > i)>:: \
make(state, t); \
} else if (NewDim == i) { \
return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t); \
} else { \
return detail::DowncastTHC<T, i, IndexT, \
PtrTraits, NewDim, (NewDim < i)>:: \
make(state, t); \
} \
/* break; */
template <typename T, int NewDim,
typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
toDeviceTensorCast(THCState* state, THCudaTensor* t) {
switch (THCudaTensor_nDimension(state, t)) {
SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
default:
;
}
// Not implemented
THError("THCDeviceTensor dimension size not supported");
return NULL; /* never enters this piece, appeasing compiler warnings */
}
#undef SWITCH_UNROLL_CUDA_CAST_FACTORY