Commit 30be3799 authored by Hang Zhang

update docs

parent 79d52ff9
.. role:: hidden
:class: hidden-section
encoding
========
.. automodule:: encoding
Modules
-------
.. automodule:: encoding.modules
.. currentmodule:: encoding.modules
.. currentmodule:: encoding
:hidden:`Encoding`
~~~~~~~~~~~~~~~~~~
.. autoclass:: Encoding
:members:
.. automodule:: encoding.functions
:hidden:`Aggregate`
~~~~~~~~~~~~~~~~~~~
.. autoclass:: Aggregate
:members:
Functions
---------
:hidden:`aggregate`
~~~~~~~~~~~~~~~~~~~
.. autoclass:: aggregate
:members:
.. automodule:: encoding.syncbn
@@ -6,7 +6,7 @@
:github_url: https://github.com/zhanghang1989/PyTorch-Encoding
Encoding documentation
===================================
======================
PyTorch-Encoding is a GPU-optimized PyTorch package that provides the Encoding Layer and Synchronized Batch Normalization.
@@ -18,7 +18,7 @@ PyTorch-Encoding is an optimized PyTorch package using GPU, including Encoding L
notes/*
.. toctree::
:maxdepth: 1
:maxdepth: 2
:caption: Package Reference
encoding
@@ -8,416 +8,7 @@
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from ._ext import encoding_lib
class aggregateE(Function):
def forward(self, A, X, C):
# A \in(BxNxK), X \in(BxNxD), C \in(KxD) => E \in(BxKxD)
self.save_for_backward(A, X, C)
B, N, K = A.size()
D = X.size(2)
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, X, C = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradX = A.new().resize_as_(X)
gradC = A.new().resize_as_(C)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_backward(gradA,
gradE, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_backward(gradA,
gradE, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
gradX.copy_(torch.bmm(A, gradE))
gradC.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0))
return gradA, gradX, gradC
class ScaledL2(Function):
def forward(self, X, C, S):
B,N,D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
SL = X.new(B,N,K)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(X, C, S, SL)
return SL
def backward(self, gradSL):
X, C, S, SL = self.saved_tensors
K = C.size(0)
with torch.cuda.device_of(X):
gradX = X.new().resize_as_(X)
gradC = X.new().resize_as_(C)
gradS = X.new().resize_as_(S)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
gradS.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0))
return gradX, gradC, gradS
class Encoding(nn.Module):
def __init__(self, D, K):
super(Encoding, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.reset_params()
def reset_params(self):
std1 = 1./((self.K*self.D)**0.5)
std2 = 1./(self.K**0.5)
self.codewords.data.uniform_(-std1, std1)
self.scale.data.uniform_(-std2, std2)
def forward(self, X):
# input X is a 4D tensor
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2).contiguous()
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2).contiguous()
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# assignment weights
A = F.softmax(ScaledL2()(X, self.codewords, self.scale))
# aggregate
E = aggregateE()(A, X, self.codewords)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class aggregate(Function):
def forward(self, A, R):
# A \in(BxNxK), R \in(BxNxKxD) => E \in(BxKxD)
self.save_for_backward(A, R)
B, N, K, D = R.size()
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
else:
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, R = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradR = R.new().resize_as_(R)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_backward(gradA,
gradR, gradE, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_backward(gradA,
gradR, gradE, A, R)
else:
raise RuntimeError('Unimplemented data type!')
return gradA, gradR
class residual(Function):
def forward(self, X, C):
# X \in(BxNxD), C \in(KxD) => R \in(BxNxKxD)
B, N, D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
R = X.new(B,N,K,D)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_residual_forward(R, X, C)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_residual_forward(R, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return R
def backward(self, gradR):
B, N, K, D = gradR.size()
with torch.cuda.device_of(gradR):
gradX = gradR.new(B,N,D)
gradD = gradR.new(K,D)
if isinstance(gradR, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Float_residual_backward(gradR,
gradX, gradD)
elif isinstance(gradR, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Double_residual_backward(gradR,
gradX, gradD)
else:
raise RuntimeError('Unimplemented data type!')
return gradX, gradD
class square_squeeze(Function):
def forward(self, R):
B, N, K, D = R.size()
with torch.cuda.device_of(R):
L = R.new(B,N,K)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(L, R)
return L
def backward(self, gradL):
L, R = self.saved_tensors
B, N, K, D = R.size()
with torch.cuda.device_of(R):
gradR = R.new(B,N,K,D)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Float_squaresqueeze_backward(gradL,
gradR, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Double_squaresqueeze_backward(gradL,
gradR, R)
else:
raise RuntimeError('Unimplemented data type!')
return gradR
def assign(R, S):
L = square_squeeze()(R)
K = S.size(0)
SL = L * S.view(1,1,K)
return F.softmax(SL)
class Aggregate(nn.Module):
def forward(self, A, R):
return aggregate()(A, R)
class EncodingP(nn.Module):
def __init__(self, D, K):
super(EncodingP, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.reset_params()
def reset_params(self):
std1 = 1./((self.K*self.D)**0.5)
std2 = 1./(self.K**0.5)
self.codewords.data.uniform_(-std1, std1)
self.scale.data.uniform_(-std2, std2)
def forward(self, X):
# input X is a 4D tensor
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2)
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2)
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# calculate residuals
R = residual()(X.contiguous(), self.codewords)
# assignment weights
A = assign(R, self.scale)
# aggregate
E = aggregate()(A, R)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class sum_square(Function):
def forward(ctx, input):
ctx.save_for_backward(input)
B,C,H,W = input.size()
with torch.cuda.device_of(input):
xsum = input.new().resize_(C).zero_()
xsquare = input.new().resize_(C).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Forward(
input.view(B,C,-1), xsum, xsquare)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_sum_square_Forward(
input.view(B,C,-1), xsum, xsquare)
else:
raise RuntimeError('Unimplemented data type!')
return xsum, xsquare
def backward(ctx, gradSum, gradSquare):
input, = ctx.saved_tensors
B,C,H,W = input.size()
with torch.cuda.device_of(input):
gradInput = input.new().resize_(B,C,H*W).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Backward(
gradInput, input.view(B,C,-1), gradSum, gradSquare)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_sum_square_Backward(
gradInput, input.view(B,C,-1), gradSum, gradSquare)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput.view(B,C,H,W)
class batchnormtrain(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
assert(input.dim()==3)
with torch.cuda.device_of(input):
invstd = 1.0 / std
output = input.new().resize_as_(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
input, gamma, beta, mean, std = ctx.saved_tensors
invstd = 1.0 / std
with torch.cuda.device_of(input):
gradInput = gradOutput.new().resize_as_(input).zero_()
gradGamma = gradOutput.new().resize_as_(gamma).zero_()
gradBeta = gradOutput.new().resize_as_(beta).zero_()
gradMean = gradOutput.new().resize_as_(mean).zero_()
gradStd = gradOutput.new().resize_as_(std).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
True)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
True)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
class batchnormeval(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
assert(input.dim()==3)
with torch.cuda.device_of(input):
invstd = 1.0 / std
output = input.new().resize_as_(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
input, gamma, beta, mean, std = ctx.saved_tensors
invstd = 1.0 / std
with torch.cuda.device_of(input):
gradInput = gradOutput.new().resize_as_(input).zero_()
gradGamma = gradOutput.new().resize_as_(gamma).zero_()
gradBeta = gradOutput.new().resize_as_(beta).zero_()
gradMean = gradOutput.new().resize_as_(mean).zero_()
gradStd = gradOutput.new().resize_as_(std).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
False)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
False)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
from .functions import *
from .modules import *
from .syncbn import *
from .aggregate import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
class aggregate(Function):
r"""
Aggregation operation: aggregates the residuals of the inputs (:math:`X`) with respect to the codewords (:math:`C`), weighted by the assignment weights (:math:`A`).
.. math::
e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - c_k)
Shape:
- Input: :math:`A\in\mathcal{R}^{B\times N\times K}` :math:`X\in\mathcal{R}^{B\times N\times D}` :math:`C\in\mathcal{R}^{K\times D}` (where :math:`B` is the batch size, :math:`N` is the total number of features, :math:`K` is the number of codewords, and :math:`D` is the feature dimension.)
- Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
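Examples:
A minimal gradcheck sketch of the expected call pattern; it mirrors the accompanying tests and assumes a CUDA device with double-precision tensors.
>>> import encoding
>>> import torch
>>> from torch.autograd import Variable, gradcheck
>>> B,N,K,D = 2,3,4,5
>>> A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5), requires_grad=True)
>>> X = Variable(torch.cuda.DoubleTensor(B,N,D).uniform_(-0.5,0.5), requires_grad=True)
>>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
>>> gradcheck(encoding.aggregate(), (A, X, C), eps=1e-6, atol=1e-4)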
"""
def forward(self, A, X, C):
# A \in(BxNxK), X \in(BxNxD), C \in(KxD) => E \in(BxKxD)
self.save_for_backward(A, X, C)
B, N, K = A.size()
D = X.size(2)
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, X, C = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradX = A.new().resize_as_(X)
gradC = A.new().resize_as_(C)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_backward(gradA,
gradE, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_backward(gradA,
gradE, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
gradX.copy_(torch.bmm(A, gradE))
gradC.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0))
return gradA, gradX, gradC
class ScaledL2(Function):
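r"""
Scaled squared L2 distance: computes :math:`SL_{ik} = s_k\,\|x_i - c_k\|^2` for features :math:`X\in\mathcal{R}^{B\times N\times D}`, codewords :math:`C\in\mathcal{R}^{K\times D}`, and per-codeword scales :math:`S\in\mathcal{R}^{K}`; its softmax gives the assignment weights used by :class:`Encoding`.
"""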
def forward(self, X, C, S):
B,N,D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
SL = X.new(B,N,K)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(X, C, S, SL)
return SL
def backward(self, gradSL):
X, C, S, SL = self.saved_tensors
K = C.size(0)
with torch.cuda.device_of(X):
gradX = X.new().resize_as_(X)
gradC = X.new().resize_as_(C)
gradS = X.new().resize_as_(S)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
gradS.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0))
return gradX, gradC, gradS
class aggregateP(Function):
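r"""
Residual-based aggregation kept for :class:`Aggregate` and the deprecated ``EncodingP``: aggregates explicit residuals :math:`R\in\mathcal{R}^{B\times N\times K\times D}` with assignment weights :math:`A\in\mathcal{R}^{B\times N\times K}` into :math:`E\in\mathcal{R}^{B\times K\times D}`.
"""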
def forward(self, A, R):
# A \in(BxNxK), R \in(BxNxKxD) => E \in(BxKxD)
self.save_for_backward(A, R)
B, N, K, D = R.size()
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
else:
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, R = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradR = R.new().resize_as_(R)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_backward(gradA,
gradR, gradE, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_backward(gradA,
gradR, gradE, A, R)
else:
raise RuntimeError('Unimplemented data type!')
return gradA, gradR
class residual(Function):
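r"""
Pairwise residuals :math:`r_{ik} = x_i - c_k` between features :math:`X\in\mathcal{R}^{B\times N\times D}` and codewords :math:`C\in\mathcal{R}^{K\times D}`, giving :math:`R\in\mathcal{R}^{B\times N\times K\times D}`.
"""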
def forward(self, X, C):
# X \in(BxNxD), C \in(KxD) => R \in(BxNxKxD)
B, N, D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
R = X.new(B,N,K,D)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_residual_forward(R, X, C)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_residual_forward(R, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return R
def backward(self, gradR):
B, N, K, D = gradR.size()
with torch.cuda.device_of(gradR):
gradX = gradR.new(B,N,D)
gradD = gradR.new(K,D)
if isinstance(gradR, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Float_residual_backward(gradR,
gradX, gradD)
elif isinstance(gradR, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Double_residual_backward(gradR,
gradX, gradD)
else:
raise RuntimeError('Unimplemented data type!')
return gradX, gradD
class square_squeeze(Function):
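r"""
Squared norms of the residuals: reduces :math:`R\in\mathcal{R}^{B\times N\times K\times D}` to :math:`L_{ik}=\|r_{ik}\|^2` of shape :math:`B\times N\times K`.
"""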
def forward(self, R):
B, N, K, D = R.size()
with torch.cuda.device_of(R):
L = R.new(B,N,K)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(L, R)
return L
def backward(self, gradL):
L, R = self.saved_tensors
B, N, K, D = R.size()
with torch.cuda.device_of(R):
gradR = R.new(B,N,K,D)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Float_squaresqueeze_backward(gradL,
gradR, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Double_squaresqueeze_backward(gradL,
gradR, R)
else:
raise RuntimeError('Unimplemented data type!')
return gradR
def assign(R, S):
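r"""
Soft-assignment weights from residuals: given :math:`R\in\mathcal{R}^{B\times N\times K\times D}` and per-codeword scales :math:`S\in\mathcal{R}^{K}`, returns :math:`A_{ik}=\mathrm{softmax}_k\left(s_k\,\|r_{ik}\|^2\right)` of shape :math:`B\times N\times K`.
"""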
L = square_squeeze()(R)
K = S.size(0)
SL = L * S.view(1,1,K)
return F.softmax(SL)
from .encoding import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
from ..functions import *
class Encoding(nn.Module):
r"""
Encoding Layer: a learnable residual encoder over a 3D or 4D input that is seen as a mini-batch.
.. math::
a_{ik} = \frac{\exp(-\beta\|x_{i}-c_k\|^2)}{\sum_{j=1}^K \exp(-\beta\|x_{i}-c_j\|^2)}
Args:
D: dimension of the features (number of feature channels)
K: number of codewords
Shape:
- Input: :math:`X\in\mathcal{R}^{B\times N\times D}` or :math:`\mathcal{R}^{B\times D\times H\times W}` (where :math:`B` is the batch size and :math:`N` is the total number of features, i.e. :math:`H\times W` for 4D input.)
- Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
Attributes:
codewords (Tensor): the learnable codewords of shape (:math:`K\times D`)
scale (Tensor): the learnable scale factor of visual centers
Examples:
>>> import encoding
>>> import torch
>>> import torch.nn.functional as F
>>> from torch.autograd import Variable, gradcheck
>>> B,C,H,W,K = 2,3,4,5,6
>>> X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5), requires_grad=True)
>>> layer = encoding.Encoding(C,K).double().cuda()
>>> E = layer(X)
Reference:
Zhang, Hang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network." *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
"""
def __init__(self, D, K):
super(Encoding, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.reset_params()
def reset_params(self):
std1 = 1./((self.K*self.D)**0.5)
std2 = 1./(self.K**0.5)
self.codewords.data.uniform_(-std1, std1)
self.scale.data.uniform_(-std2, std2)
def forward(self, X):
# input X is a 4D tensor
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2).contiguous()
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2).contiguous()
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# assignment weights
A = F.softmax(ScaledL2()(X, self.codewords, self.scale))
# aggregate
E = aggregate()(A, X, self.codewords)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
class Aggregate(nn.Module):
r"""
Aggregation operation: aggregates the residuals (:math:`R`) with the assignment weights (:math:`A`).
.. math::
e_{k} = \sum_{i=1}^{N} a_{ik} (r_{ik})
Shape:
- Input: :math:`A\in\mathcal{R}^{B\times N\times K}` :math:`R\in\mathcal{R}^{B\times N\times K\times D}` (where :math:`B` is the batch size, :math:`N` is the total number of features, :math:`K` is the number of codewords, and :math:`D` is the feature dimension.)
- Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
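Examples:
A minimal usage sketch following the shapes used in the accompanying tests; it assumes a CUDA device, and ``Aggregate`` is referenced through the top-level package as in the docs.
>>> import encoding
>>> import torch
>>> from torch.autograd import Variable
>>> B,N,K,D = 2,3,4,5
>>> A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5), requires_grad=True)
>>> R = Variable(torch.cuda.DoubleTensor(B,N,K,D).uniform_(-0.5,0.5), requires_grad=True)
>>> E = encoding.Aggregate()(A, R)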
"""
def forward(self, A, R):
return aggregateP()(A, R)
class EncodingP(nn.Module):
def __init__(self, D, K):
super(EncodingP, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.reset_params()
print('EncodingP is deprecated, please use Encoding.')
def reset_params(self):
std1 = 1./((self.K*self.D)**0.5)
std2 = 1./(self.K**0.5)
self.codewords.data.uniform_(-std1, std1)
self.scale.data.uniform_(-std2, std2)
def forward(self, X):
# input X is a 4D tensor
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2)
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2)
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# calculate residuals
R = residual()(X.contiguous(), self.codewords)
# assignment weights
A = assign(R, self.scale)
# aggregate
E = aggregateP()(A, R)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from ._ext import encoding_lib
class sum_square(Function):
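r"""
Per-channel sum and sum of squares of a :math:`B\times C\times H\times W` input, returned as two tensors of size :math:`C`; these are the batch statistics gathered for synchronized batch normalization.
"""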
def forward(ctx, input):
ctx.save_for_backward(input)
B,C,H,W = input.size()
with torch.cuda.device_of(input):
xsum = input.new().resize_(C).zero_()
xsquare = input.new().resize_(C).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Forward(
input.view(B,C,-1), xsum, xsquare)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_sum_square_Forward(
input.view(B,C,-1), xsum, xsquare)
else:
raise RuntimeError('Unimplemented data type!')
return xsum, xsquare
def backward(ctx, gradSum, gradSquare):
input, = ctx.saved_tensors
B,C,H,W = input.size()
with torch.cuda.device_of(input):
gradInput = input.new().resize_(B,C,H*W).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Backward(
gradInput, input.view(B,C,-1), gradSum, gradSquare)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_sum_square_Backward(
gradInput, input.view(B,C,-1), gradSum, gradSquare)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput.view(B,C,H,W)
class batchnormtrain(Function):
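r"""
Applies batch normalization to a 3D input (:math:`B\times C\times N`) using externally supplied per-channel ``mean`` and ``std`` together with the affine parameters ``gamma`` and ``beta``. Training-mode variant: the backward pass also returns gradients with respect to ``mean`` and ``std``, so statistics computed outside this function (e.g. across devices) receive gradients.
"""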
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
assert(input.dim()==3)
with torch.cuda.device_of(input):
invstd = 1.0 / std
output = input.new().resize_as_(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
input, gamma, beta, mean, std = ctx.saved_tensors
invstd = 1.0 / std
with torch.cuda.device_of(input):
gradInput = gradOutput.new().resize_as_(input).zero_()
gradGamma = gradOutput.new().resize_as_(gamma).zero_()
gradBeta = gradOutput.new().resize_as_(beta).zero_()
gradMean = gradOutput.new().resize_as_(mean).zero_()
gradStd = gradOutput.new().resize_as_(std).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
True)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
True)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
class batchnormeval(Function):
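r"""
Evaluation-mode counterpart of :class:`batchnormtrain`: the forward pass is identical, but the backward kernel is invoked with the training flag set to ``False`` (the supplied statistics are expected to be fixed running estimates).
"""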
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
assert(input.dim()==3)
with torch.cuda.device_of(input):
invstd = 1.0 / std
output = input.new().resize_as_(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
input, gamma, beta, mean, std = ctx.saved_tensors
invstd = 1.0 / std
with torch.cuda.device_of(input):
gradInput = gradOutput.new().resize_as_(input).zero_()
gradGamma = gradOutput.new().resize_as_(gamma).zero_()
gradBeta = gradOutput.new().resize_as_(beta).zero_()
gradMean = gradOutput.new().resize_as_(mean).zero_()
gradStd = gradOutput.new().resize_as_(std).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
False)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
False)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
@@ -13,18 +13,18 @@ import torch
import torch.nn.functional as F
from torch.autograd import Variable, gradcheck
def test_aggregate():
def test_aggregateP():
B,N,K,D = 2,3,4,5
A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5),
requires_grad=True)
R = Variable(torch.cuda.DoubleTensor(B,N,K,D).uniform_(-0.5,0.5),
requires_grad=True)
input = (A, R)
test = gradcheck(encoding.aggregate(), input, eps=1e-6, atol=1e-4)
test = gradcheck(encoding.aggregateP(), input, eps=1e-6, atol=1e-4)
print('Testing aggregateP(): {}'.format(test))
def test_aggregateE():
def test_aggregate():
B,N,K,D = 2,3,4,5
A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5),
requires_grad=True)
@@ -33,8 +33,8 @@ def test_aggregateE():
C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5),
requires_grad=True)
input = (A, X, C)
test = gradcheck(encoding.aggregateE(), input, eps=1e-6, atol=1e-4)
print('Testing aggregateE(): {}'.format(test))
test = gradcheck(encoding.aggregate(), input, eps=1e-6, atol=1e-4)
print('Testing aggregate(): {}'.format(test))
def test_ScaledL2():
@@ -61,10 +61,10 @@ def test_assign():
R = encoding.residual()(X, C)
A1 = encoding.assign(R, S)
E1 = encoding.aggregate()(A1, R)
E1 = encoding.aggregateP()(A1, R)
A2 = F.softmax(encoding.ScaledL2()(X,C,S))
E2 = encoding.aggregateE()(A2, X, C)
E2 = encoding.aggregate()(A2, X, C)
print('E1', E1)
print('E2', E2)
@@ -120,7 +120,7 @@ def test_sum_square():
if __name__ == '__main__':
test_aggregateE()
test_aggregateP()
test_ScaledL2()
test_encoding()
test_aggregate()
@@ -129,3 +129,4 @@ if __name__ == '__main__':
test_square_squeeze()
test_encodingP()
test_sum_square()