Commit 1177a80b authored by Hang Zhang's avatar Hang Zhang

memory efficient implementation and test script

parent 8dd870b1
......@@ -28,9 +28,11 @@ else:
os.environ['THC_LIBRARIES'] = os.path.join(lib_path,'libTHC.so.1')
ENCODING_LIB = os.path.join(lib_path, 'libENCODING.so')
clean_cmd = ['bash', 'clean.sh']
subprocess.check_call(clean_cmd)
build_all_cmd = ['bash', 'encoding/make.sh']
if subprocess.call(build_all_cmd, env=dict(os.environ)) != 0:
sys.exit(1)
subprocess.check_call(build_all_cmd, env=dict(os.environ))
sources = ['encoding/src/encoding_lib.cpp']
headers = ['encoding/src/encoding_lib.h']
......
......@@ -10,55 +10,253 @@
import threading
import torch
import torch.cuda.nccl as nccl
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from torch.nn.parameter import Parameter
from ._ext import encoding_lib
class aggregateE(Function):
def forward(self, A, X, C):
# A \in(BxNxK) X \in(BxNxD) C \in(KxD) => E \in(BxKxD)
self.save_for_backward(A, X, C)
B, N, K = A.size()
D = X.size(2)
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, X, C = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradX = A.new().resize_as_(X)
gradC = A.new().resize_as_(C)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_backward(gradA,
gradE, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_backward(gradA,
gradE, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
gradX.copy_(torch.bmm(A, gradE))
gradC.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0))
return gradA, gradX, gradC
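# For reference, the aggregateE forward pass can be reproduced with plain tensor
# ops (an illustrative sketch, not part of this commit; it materializes the full
# BxNxKxD residual tensor, which is exactly the memory cost the fused CUDA
# kernel above avoids):
#
# def aggregateE_reference(A, X, C):
#     # A: BxNxK assignments, X: BxNxD features, C: KxD codewords
#     B, N, K = A.size()
#     D = X.size(2)
#     # R[b,i,k,d] = X[b,i,d] - C[k,d]  (BxNxKxD, memory heavy)
#     R = X.unsqueeze(2).expand(B, N, K, D) - C.view(1, 1, K, D).expand(B, N, K, D)
#     # E[b,k,d] = sum_i A[b,i,k] * R[b,i,k,d]
#     return (A.unsqueeze(3) * R).sum(1)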
class ScaledL2(Function):
def forward(self, X, C, S):
B,N,D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
SL = X.new(B,N,K)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(X, C, S, SL)
return SL
def backward(self, gradSL):
X, C, S, SL = self.saved_tensors
K = C.size(0)
with torch.cuda.device_of(X):
gradX = X.new().resize_as_(X)
gradC = X.new().resize_as_(C)
gradS = X.new().resize_as_(S)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
gradS.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0))
return gradX, gradC, gradS
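# The ScaledL2 forward pass has an equally simple tensor-op reference (a sketch
# using the imports at the top of this file, not part of the commit):
#
# def scaled_l2_reference(X, C, S):
#     # X: BxNxD, C: KxD codewords, S: K smoothing factors
#     B, N, D = X.size()
#     K = C.size(0)
#     R = X.unsqueeze(2).expand(B, N, K, D) - C.view(1, 1, K, D).expand(B, N, K, D)
#     # SL[b,i,k] = S[k] * ||X[b,i] - C[k]||^2
#     return S.view(1, 1, K) * R.pow(2).sum(3)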
class Encoding(nn.Module):
def __init__(self, D, K):
super(Encoding, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.reset_params()
def reset_params(self):
std1 = 1./((self.K*self.D)**0.5)
std2 = 1./((self.K)**0.5)
self.codewords.data.uniform_(-std1, std1)
self.scale.data.uniform_(-std2, std2)
def forward(self, X):
# input X is a 4D tensor
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2).contiguous()
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2).contiguous()
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# assignment weights
SL = ScaledL2()(X, self.codewords, self.scale)
# softmax over the K codewords (flattened to 2D for the 2D softmax API)
A = F.softmax(SL.view(B*N, K)).view(B, N, K)
# aggregate
E = aggregateE()(A, X, self.codewords)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
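# A minimal usage sketch for the layer (hypothetical, not part of the commit;
# it assumes the CUDA extension is built, since only GPU tensors are handled):
#
# layer = Encoding(D=16, K=32).cuda()
# X = Variable(torch.randn(2, 16, 8, 8).cuda(), requires_grad=True)
# E = layer(X)          # shape 2 x 32 x 16, i.e. B x K x D
# E.sum().backward()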
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class aggregate(Function):
def forward(self, A, R):
# A \in(BxNxK) R \in(BxNxKxD) => E \in(BxKxD)
self.save_for_backward(A, R)
B, N, K, D = R.size()
with torch.cuda.device_of(A):
E = A.new(B,K,D)
# TODO support cpu backend
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, R = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradR = R.new().resize_as_(R)
if isinstance(A, torch.cuda.FloatTensor):
encoding_lib.Encoding_Float_aggregate_backward(gradA, gradR, gradE,
A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_backward(gradA,
gradR, gradE, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
encoding_lib.Encoding_Double_aggregate_backward(gradA, gradR, gradE,
A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_backward(gradA,
gradR, gradE, A, R)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradA, gradR
class residual(Function):
def forward(self, X, C):
# X \in(BxNxD) C \in(KxD) => R \in(BxNxKxD)
B, N, D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
R = X.new(B,N,K,D)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_residual_forward(R, X, C)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_residual_forward(R, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return R
def backward(self, gradR):
B, N, K, D = gradR.size()
with torch.cuda.device_of(gradR):
gradX = gradR.new(B,N,D)
gradD = gradR.new(K,D)
if isinstance(gradR, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Float_residual_backward(gradR,
gradX, gradD)
elif isinstance(gradR, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Double_residual_backward(gradR,
gradX, gradD)
else:
raise RuntimeError('Unimplemented data type!')
return gradX, gradD
class square_squeeze(Function):
def forward(self, R):
B, N, K, D = R.size()
with torch.cuda.device_of(R):
L = R.new(B,N,K)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(L, R)
return L
def backward(self, gradL):
L, R = self.saved_tensors
B, N, K, D = R.size()
with torch.cuda.device_of(R):
gradR = R.new(B,N,K,D)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Float_squaresqueeze_backward(gradL,
gradR, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Double_squaresqueeze_backward(gradL,
gradR, R)
else:
raise RuntimeError('Unimplemented data type!')
return gradR
def assign(R, S):
L = square_squeeze()(R)
K = S.size(0)
SL = L * S.view(1,1,K)
B, N, _ = SL.size()
# softmax over the K codewords (flattened to 2D for the 2D softmax API)
return F.softmax(SL.view(B*N, K)).view(B, N, K)
class Aggregate(nn.Module):
def forward(self, A, R):
return aggregate()(A, R)
class Encoding(nn.Module):
class EncodingP(nn.Module):
def __init__(self, D, K):
super(Encoding, self).__init__()
super(EncodingP, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D), requires_grad=True)
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.softmax = nn.Softmax()
self.reset_params()
def reset_params(self):
......@@ -69,34 +267,33 @@ class Encoding(nn.Module):
def forward(self, X):
# input X is a 4D tensor
assert(X.size(1)==self.D,"Encoding Layer incompatible input channels!")
unpacked = False
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
unpacked = True
X = X.unsqueeze(0)
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2)
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
# reshape input
X = X.view(B,D,-1).transpose(1,2)
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# calculate residuals
R = X.contiguous().view(B,N,1,D).expand(B,N,K,D) - self.codewords.view(
1,1,K,D).expand(B,N,K,D)
R = residual()(X.contiguous(), self.codewords)
# assignment weights
A = R
A = A.pow(2).sum(3).view(B,N,K)
A = A*self.scale.view(1,1,K).expand_as(A)
A = self.softmax(A.view(B*N,K)).view(B,N,K)
A = assign(R, self.scale)
# aggregate
E = aggregate()(A, R)
if unpacked:
E = E.squeeze(0)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' + str(self.D) + ')'
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
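# The commit message mentions a test script; a gradient check along the
# following lines is one way to exercise the new CUDA functions (a hedged
# sketch, the actual test/test.py may differ; double precision is used because
# gradcheck needs tight tolerances):
#
# from torch.autograd import gradcheck
#
# A = Variable(torch.randn(2, 5, 3).double().cuda(), requires_grad=True)
# X = Variable(torch.randn(2, 5, 4).double().cuda(), requires_grad=True)
# C = Variable(torch.randn(3, 4).double().cuda(), requires_grad=True)
# S = Variable(torch.randn(3).double().cuda(), requires_grad=True)
# # a fresh Function instance per call, as old-style Functions must not be reused
# print(gradcheck(lambda x, c, s: ScaledL2()(x, c, s), (X, C, S)))
# print(gradcheck(lambda a, x, c: aggregateE()(a, x, c), (A, X, C)))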
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class sum_square(Function):
def forward(ctx, input):
ctx.save_for_backward(input)
......@@ -113,7 +310,7 @@ class sum_square(Function):
encoding_lib.Encoding_Double_sum_square_Forward(
input.view(B,C,-1), xsum, xsquare)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return xsum, xsquare
def backward(ctx, gradSum, gradSquare):
......@@ -121,8 +318,6 @@ class sum_square(Function):
B,C,H,W = input.size()
with torch.cuda.device_of(input):
gradInput = input.new().resize_(B,C,H*W).zero_()
# gradSum.view(1,C,1,1).expand_as(input) + \
# 2*gradSquare.view(1,C,1,1).expand_as(input)*input
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Backward(
......@@ -132,9 +327,10 @@ class sum_square(Function):
encoding_lib.Encoding_Double_sum_square_Backward(
gradInput, input.view(B,C,-1), gradSum, gradSquare)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput.view(B,C,H,W)
class batchnormtrain(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
......@@ -151,7 +347,7 @@ class batchnormtrain(Function):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
......@@ -177,9 +373,10 @@ class batchnormtrain(Function):
mean, invstd, gamma, beta, gradMean, gradStd,
True)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
class batchnormeval(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
......@@ -196,7 +393,7 @@ class batchnormeval(Function):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
......@@ -221,6 +418,6 @@ class batchnormeval(Function):
mean, invstd, gamma, beta, gradMean, gradStd,
False)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
......@@ -12,6 +12,62 @@
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
__global__ void Encoding_(AggregateE_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, N;
real sum;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
k = blockIdx.y * blockDim.y + threadIdx.y;
N = A.getSize(1);
/* boundary check for output */
sum = 0;
if (d >= E.getSize(2) || k >= E.getSize(1)) return;
/* main operation */
for(i=0; i<N; i++) {
sum += A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
E[b][k][d] = sum;
}
void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregating forward the residuals with assignment weights
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1,
E.getSize(0));
Encoding_(AggregateE_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
}
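The kernel above computes E[b][k][d] = sum_i A[b][i][k] * (X[b][i][d] - C[k][d]); a host-side reference for checking it (a sketch, assuming a PyTorch build that provides torch.einsum):

import torch

def aggregateE_forward_reference(A, X, C):
    # E[b,k,d] = sum_i A[b,i,k] * (X[b,i,d] - C[k,d])
    return torch.einsum('bik,bid->bkd', A, X) - torch.einsum('bik,kd->bkd', A, C)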
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
......@@ -63,11 +119,72 @@ void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(AggregateE_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating backward kernel function
* GA (dl/dA), GE (dl/dE), A (assignments)
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real sum;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y * blockDim.y + threadIdx.y;
k = blockIdx.x * blockDim.x + threadIdx.x;
D = GE.getSize(2);
/* boundary check for output G \in R^{BxNxKxD} */
if (k >= GA.getSize(2) || i >= GA.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
sum += GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
GA[b][i][k] = sum;
}
void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate backward to assignment weights
* GA (dl/dA), GE (dl/dE)
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1,
GA.getSize(0));
Encoding_(AggregateE_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GA, GE, A, X, C);
THCudaCheck(cudaGetLastError());
}
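The backward kernel reduces over the channel dimension: GA[b][i][k] = sum_d GE[b][k][d] * (X[b][i][d] - C[k][d]); a host-side reference sketch (again assuming torch.einsum is available):

import torch

def aggregateE_backward_gradA_reference(GE, X, C):
    # GA[b,i,k] = sum_d GE[b,k,d] * (X[b,i,d] - C[k,d])
    return (torch.einsum('bkd,bid->bik', GE, X)
            - torch.einsum('bkd,kd->bk', GE, C).unsqueeze(1))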
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 3> L,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 4> R)
/*
......@@ -82,38 +199,37 @@ __global__ void Encoding_(Aggregate_Backward_kernel) (
b = blockIdx.z;
i = blockIdx.y * blockDim.y + threadIdx.y;
k = blockIdx.x * blockDim.x + threadIdx.x;
D = L.getSize(2);
D = GE.getSize(2);
/* boundary check for output G \in R^{BxNxKxD} */
if (k >= GR.getSize(2) || i >= GR.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
//sum += L[b][k][d].ldg() * R[b][i][k][d].ldg();
GR[b][i][k][d] = L[b][k][d].ldg() * A[b][i][k].ldg();
sum += L[b][k][d].ldg() * R[b][i][k][d].ldg();
GR[b][i][k][d] = GE[b][k][d].ldg() * A[b][i][k].ldg();
sum += GE[b][k][d].ldg() * R[b][i][k][d].ldg();
}
GA[b][i][k] = sum;
}
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GR_, THCTensor *L_, THCTensor *A_, THCTensor *R_)
THCTensor *GR_, THCTensor *GE_, THCTensor *A_, THCTensor *R_)
/*
* aggregate backward to assignment weights
* GR (dl/dR), GE (dl/dE), A (assignments)
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GR_, L_, A_, R_);
THCTensor_(checkGPU)(state, 5, GA_, GR_, GE_, A_, R_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, L_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function */
......@@ -122,14 +238,397 @@ void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
dim3 blocks(GA.getSize(2)/16+1, GA.getSize(1)/16+1,
GA.getSize(0));
Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>(GA,
GR, L, A, R);
GR, GE, A, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Residual_Forward_kernel) (
THCDeviceTensor<real, 4> R,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> D)
/*
* residual forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
K = R.getSize(2);
/* boundary check for output */
if (d >= X.getSize(2) || i >= X.getSize(1)) return;
/* main operation */
for(k=0; k<K; k++) {
R[b][i][k][d] = X[b][i][d].ldg() - D[k][d].ldg();
}
}
void Encoding_(Residual_Forward)(
THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_)
/*
* computes residuals R[b][i][k][d] = X[b][i][d] - D[k][d]
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, R_, X_, D_);
if (THCTensor_(nDimension)(state, R_) != 4 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, D_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> D = devicetensor<2>(state, D_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(X.getSize(2)/16+1, X.getSize(1)/16+1,
X.getSize(0));
Encoding_(Residual_Forward_kernel)<<<blocks, threads, 0, stream>>>(R, X, D);
THCudaCheck(cudaGetLastError());
}
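The residual kernel is a broadcast subtraction; the equivalent in plain PyTorch (a reference sketch, where the KxD codeword matrix named D above is passed as C):

import torch

def residual_forward_reference(X, C):
    # R[b,i,k,d] = X[b,i,d] - C[k,d]
    return X.unsqueeze(2) - C.unsqueeze(0).unsqueeze(0)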
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ResidualX_Backward_kernel) (
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 3> GX)
/*
* residual backward kernel function (gradient w.r.t. X)
*/
{
/* declarations of the variables */
int b, k, d, i, K;
real sum;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
K = GR.getSize(2);
/* boundary check for output */
if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
/* main operation */
sum = 0;
for(k=0; k<K; k++) {
sum += GR[b][i][k][d].ldg();
}
GX[b][i][d] = sum;
}
__global__ void Encoding_(ResidualD_Backward_kernel) (
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 2> GD)
/*
* residual backward kernel function (gradient w.r.t. the codewords)
*/
{
/* declarations of the variables */
int b, k, d, i, B, N;
real sum;
/* Get the index and channels */
d = blockIdx.x * blockDim.x + threadIdx.x;
k = blockIdx.y * blockDim.y + threadIdx.y;
B = GR.getSize(0);
N = GR.getSize(1);
/* boundary check for output */
if (d >= GD.getSize(1) || k >= GD.getSize(0)) return;
/* main operation */
sum = 0;
for(b=0; b<B; b++) {
for(i=0; i<N; i++) {
sum -= GR[b][i][k][d].ldg();
}
}
GD[k][d] = sum;
}
void Encoding_(Residual_Backward)(
THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_)
/*
* residual backward: reduces GR into GX and GD
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, GR_, GX_, GD_);
if (THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GD_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GD = devicetensor<2>(state, GD_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1,
GX.getSize(0));
Encoding_(ResidualX_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GR, GX);
THCudaCheck(cudaGetLastError());
dim3 blocks2(GD.getSize(1)/16+1, GD.getSize(0)/16+1);
Encoding_(ResidualD_Backward_kernel)<<<blocks2, threads, 0, stream>>>
(GR, GD);
THCudaCheck(cudaGetLastError());
}
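Both residual backward kernels are plain reductions over GR; a reference sketch of the two gradients:

import torch

def residual_backward_reference(GR):
    # gradX[b,i,d] = sum_k GR[b,i,k,d];  gradD[k,d] = -sum_{b,i} GR[b,i,k,d]
    return GR.sum(2), -GR.sum(0).sum(0)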
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
THCDeviceTensor<real, 3> SL,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
* scaled-L2 forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real r, sum;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = X.getSize(2);
/* boundary check for output */
if (k >= SL.getSize(2) || i >= SL.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
r = X[b][i][d].ldg() - C[k][d].ldg();
sum += r * r;
}
SL[b][i][k] = S[k] * sum;
}
void Encoding_(ScaledL2_Forward)(
THCState *state, THCTensor *SL_, THCTensor *X_,
THCTensor *C_, THCTensor *S_)
/*
* computes SL[b][i][k] = S[k] * ||X[b][i] - C[k]||^2
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
if (THCTensor_(nDimension)(state, SL_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(SL.getSize(2)/16+1, SL.getSize(1)/16+1,
SL.getSize(0));
Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
(SL, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 3> GX,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, k, d, i, K;
real sum;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
K = C.getSize(0);
/* boundary check for output */
if (d >= GX.getSize(2) || i >= GX.getSize(1)) return;
/* main operation */
sum = 0;
for(k=0; k<K; k++) {
sum += 2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
GX[b][i][d] = sum;
}
__global__ void Encoding_(ScaledL2C_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 2> GC,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, k, d, i, B, N;
real sum;
/* Get the index and channels */
d = blockIdx.x * blockDim.x + threadIdx.x;
k = blockIdx.y * blockDim.y + threadIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* boundary check for output */
if (d >= GC.getSize(1) || k >= GC.getSize(0)) return;
/* main operation */
sum = 0;
for(b=0; b<B; b++) {
for(i=0; i<N; i++) {
sum += -2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
}
GC[k][d] = sum;
}
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
if (THCTensor_(nDimension)(state, GSL_) != 3 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GC_) != 2 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GC = devicetensor<2>(state, GC_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(GX.getSize(2)/16+1, GX.getSize(1)/16+1,
GX.getSize(0));
Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GSL, GX, X, C, S);
THCudaCheck(cudaGetLastError());
dim3 blocks2(GC.getSize(1)/16+1, GC.getSize(0)/16+1);
Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads, 0, stream>>>
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
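The two scaled-L2 backward kernels implement GX[b][i][d] = sum_k 2*S[k]*GSL[b][i][k]*(X[b][i][d]-C[k][d]) and GC[k][d] = -sum_{b,i} 2*S[k]*GSL[b][i][k]*(X[b][i][d]-C[k][d]); a host-side reference sketch (assuming torch.einsum):

import torch

def scaled_l2_backward_reference(GSL, X, C, S):
    W = 2 * S.view(1, 1, -1) * GSL                                 # BxNxK
    GX = X * W.sum(2, keepdim=True) - torch.einsum('bik,kd->bid', W, C)
    GC = C * W.sum(0).sum(0).unsqueeze(1) - torch.einsum('bik,bid->kd', W, X)
    return GX, GC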
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Forward_kernel) (
THCDeviceTensor<real, 3> L,
THCDeviceTensor<real, 4> R)
/*
* square-squeeze forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real sum;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = R.getSize(3);
/* boundary check for output */
if (k >= L.getSize(2) || i >= L.getSize(1)) return;
/* main operation */
sum = 0;
for(d=0; d<D; d++) {
sum += R[b][i][k][d].ldg()*R[b][i][k][d].ldg();
}
L[b][i][k] = sum;
}
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_)
/*
* computes L[b][i][k] = sum_d R[b][i][k][d]^2
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, L_, R_);
if (THCTensor_(nDimension)(state, L_) != 3 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(L.getSize(2)/16+1, L.getSize(1)/16+1,
L.getSize(0));
Encoding_(SquareSqueeze_Forward_kernel)<<<blocks, threads, 0, stream>>>
(L, R);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Backward_kernel) (
THCDeviceTensor<real, 3> GL,
THCDeviceTensor<real, 4> GR,
THCDeviceTensor<real, 4> R)
/*
*/
{
/* declarations of the variables */
int b, k, d, i, D;
real scale;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
D = R.getSize(3);
/* boundary check for output */
if (k >= R.getSize(2) || i >= R.getSize(1)) return;
/* main operation */
scale = GL[b][i][k] * 2;
for(d=0; d<D; d++) {
GR[b][i][k][d] = scale * R[b][i][k][d];
}
}
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, GL_, GR_, R_);
if (THCTensor_(nDimension)(state, GL_) != 3 ||
THCTensor_(nDimension)(state, GR_) != 4 ||
THCTensor_(nDimension)(state, R_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GL = devicetensor<3>(state, GL_);
THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(R.getSize(2)/16+1, R.getSize(1)/16+1,
R.getSize(0));
Encoding_(SquareSqueeze_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GL, GR, R);
THCudaCheck(cudaGetLastError());
}
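SquareSqueeze is an elementwise square followed by a sum over the channel dimension, so both directions have one-line references (a sketch, not part of the commit):

import torch

def square_squeeze_reference(R, GL=None):
    L = R.pow(2).sum(3)               # L[b,i,k] = sum_d R[b,i,k,d]^2
    if GL is None:
        return L
    GR = 2 * GL.unsqueeze(3) * R      # GR[b,i,k,d] = 2 * GL[b,i,k] * R[b,i,k,d]
    return L, GR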
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
......@@ -140,7 +639,6 @@ __global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
//int N = input.getSize(0) * input.getSize(2);
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
......@@ -291,10 +789,6 @@ __global__ void Encoding_(BatchNorm_Backward_kernel) (
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
//int N = input.getSize(0) * input.getSize(2);
//real norm;
//norm = 1.0 / N;
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
......
......@@ -12,11 +12,37 @@
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(Aggregate_Forward)(
THCState *state, THCTensor *E_, THCTensor *A_, THCTensor *R_);
void Encoding_(Aggregate_Backward)(
THCState *state, THCTensor *GA_, THCTensor *GR_, THCTensor *L_,
THCTensor *A_, THCTensor *R_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GR_, THCTensor *L_, THCTensor *A_, THCTensor *R_);
void Encoding_(Residual_Forward)(
THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_);
void Encoding_(Residual_Backward)(
THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_);
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_);
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
......
......@@ -20,11 +20,36 @@
#include "THC/THCGenerateFloatType.h"
*/
int Encoding_Float_scaledl2_forward(THCudaTensor *SL,
THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
int Encoding_Float_scaledl2_backward(
THCudaTensor *GSL, THCudaTensor *GX, THCudaTensor *GC,
THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
int Encoding_Float_aggregateE_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *X, THCudaTensor *C);
int Encoding_Float_aggregateE_backward(THCudaTensor *GA, THCudaTensor *GE,
THCudaTensor *A, THCudaTensor *X, THCudaTensor *C);
int Encoding_Float_aggregate_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *R);
int Encoding_Float_aggregate_backward(THCudaTensor *GA, THCudaTensor *GR,
THCudaTensor *L, THCudaTensor *A, THCudaTensor *R);
int Encoding_Float_residual_forward(THCudaTensor *R, THCudaTensor *X,
THCudaTensor *D);
int Encoding_Float_residual_backward(THCudaTensor *GR, THCudaTensor *GX,
THCudaTensor *GD);
int Encoding_Float_squaresqueeze_forward(THCudaTensor *L, THCudaTensor *R);
int Encoding_Float_squaresqueeze_backward(THCudaTensor *GL,
THCudaTensor *GR, THCudaTensor *R);
int Encoding_Float_batchnorm_Forward(THCudaTensor *output_,
THCudaTensor *input_, THCudaTensor *mean_,
THCudaTensor *invstd_, THCudaTensor *gamma_, THCudaTensor *beta_);
......@@ -43,25 +68,55 @@ void Encoding_Float_sum_square_Backward(
THCudaTensor *gradInput, THCudaTensor *input_,
THCudaTensor *gradSum_, THCudaTensor *gradSquare_);
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_Double_aggregate_forward(THCudaDoubleTensor *E,
THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_scaledl2_forward(THCudaDoubleTensor *SL,
THCudaDoubleTensor *X, THCudaDoubleTensor *C, THCudaDoubleTensor *S);
int Encoding_Double_scaledl2_backward(
THCudaDoubleTensor *GSL, THCudaDoubleTensor *GX,
THCudaDoubleTensor *GC, THCudaDoubleTensor *X,
THCudaDoubleTensor *C, THCudaDoubleTensor *S);
int Encoding_Double_aggregateE_forward(THCudaDoubleTensor *E,
THCudaDoubleTensor *A, THCudaDoubleTensor *X, THCudaDoubleTensor *C);
int Encoding_Double_aggregateE_backward(THCudaDoubleTensor *GA,
THCudaDoubleTensor *GE, THCudaDoubleTensor *A, THCudaDoubleTensor *X,
THCudaDoubleTensor *C);
int Encoding_Double_aggregate_forward(
THCudaDoubleTensor *E, THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_aggregate_backward(THCudaDoubleTensor *GA,
THCudaDoubleTensor *GR, THCudaDoubleTensor *L,
int Encoding_Double_aggregate_backward(
THCudaDoubleTensor *GA, THCudaDoubleTensor *GR, THCudaDoubleTensor *L,
THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_residual_forward(
THCudaDoubleTensor *R, THCudaDoubleTensor *X, THCudaDoubleTensor *D);
int Encoding_Double_residual_backward(
THCudaDoubleTensor *GR, THCudaDoubleTensor *GX,
THCudaDoubleTensor *GD);
int Encoding_Double_squaresqueeze_forward(THCudaDoubleTensor *L,
THCudaDoubleTensor *R);
int Encoding_Double_squaresqueeze_backward(THCudaDoubleTensor *GL,
THCudaDoubleTensor *GR, THCudaDoubleTensor *R);
int Encoding_Double_batchnorm_Forward(THCudaDoubleTensor *output_,
THCudaDoubleTensor *input_, THCudaDoubleTensor *mean_,
THCudaDoubleTensor *invstd_, THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_);
THCudaDoubleTensor *invstd_, THCudaDoubleTensor *gamma_,
THCudaDoubleTensor *beta_);
int Encoding_Double_batchnorm_Backward(THCudaDoubleTensor *gradoutput_,
THCudaDoubleTensor *input_, THCudaDoubleTensor *gradinput_,
THCudaDoubleTensor *gradgamma_, THCudaDoubleTensor *gradbeta_,
THCudaDoubleTensor *mean_, THCudaDoubleTensor *invstd_,
THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_,
THCudaDoubleTensor *gradMean_, THCudaDoubleTensor *gradStd_, int train);
THCudaDoubleTensor *gradMean_, THCudaDoubleTensor *gradStd_,
int train);
int Encoding_Double_sum_square_Forward(THCudaDoubleTensor *input_,
THCudaDoubleTensor *sum_, THCudaDoubleTensor *square_);
......
......@@ -12,6 +12,56 @@
#define THC_GENERIC_FILE "generic/encoding_generic.c"
#else
int Encoding_(scaledl2_forward)(THCTensor *SL,
THCTensor *X, THCTensor *C, THCTensor *S)
/*
* ScaledL2 operation
*/
{
Encoding_(ScaledL2_Forward)(state, SL, X, C, S);
/* C function return number of the outputs */
return 0;
}
int Encoding_(scaledl2_backward)(
THCTensor *GSL, THCTensor *GX, THCTensor *GC,
THCTensor *X, THCTensor *C, THCTensor *S)
/*
* ScaledL2 operation
*/
{
Encoding_(ScaledL2_Backward)(state, GSL, GX, GC, X, C, S);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregateE_forward)(THCTensor *E, THCTensor *A,
THCTensor *X, THCTensor *C)
/*
* Aggregate operation
*/
{
Encoding_(AggregateE_Forward)(state, E, A, X, C);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregateE_backward)(THCTensor *GA, THCTensor *GE,
THCTensor *A, THCTensor *X, THCTensor *C)
/*
* Aggregate backward operation to A
* GA (dl/dA), GE (dl/dE), A (assignments)
*/
{
Encoding_(AggregateE_Backward)(state, GA, GE, A, X, C);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregate_forward)(THCTensor *E, THCTensor *A,
THCTensor *R)
/*
......@@ -35,6 +85,49 @@ int Encoding_(aggregate_backward)(THCTensor *GA, THCTensor *GR,
return 0;
}
int Encoding_(residual_forward)(THCTensor *R, THCTensor *X, THCTensor *D)
/*
* Residual operation
*/
{
Encoding_(Residual_Forward)(state, R, X, D);
/* C function return number of the outputs */
return 0;
}
int Encoding_(residual_backward)(THCTensor *GR, THCTensor *GX,
THCTensor *GD)
/*
* Residual operation
*/
{
Encoding_(Residual_Backward)(state, GR, GX, GD);
/* C function return number of the outputs */
return 0;
}
int Encoding_(squaresqueeze_forward)(THCTensor *L, THCTensor *R)
/*
* SquareSqueeze operation
*/
{
Encoding_(SquareSqueeze_Forward)(state, L, R);
/* C function return number of the outputs */
return 0;
}
int Encoding_(squaresqueeze_backward)(THCTensor *GL, THCTensor *GR,
THCTensor *R)
/*
* SquareSqueeze operation
*/
{
Encoding_(SquareSqueeze_Backward)(state, GL, GR, R);
/* C function return number of the outputs */
return 0;
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_(batchnorm_Forward)(THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
......
......@@ -10,19 +10,26 @@
import os
import sys
import subprocess
from setuptools import setup, find_packages
import build
from setuptools.command.develop import develop
from setuptools.command.install import install
this_file = os.path.dirname(__file__)
extra_compile_args = ['-std=c++11', '-Wno-write-strings']
#extra_compile_args = ['-std=c++11', '-Wno-write-strings']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux')
extra_compile_args += ['-static-libstdc++']
extra_link_args += ['-static-libstdc++']
class TestCommand(install):
"""Post-installation mode."""
def run(self):
install.run(self)
subprocess.check_call("python test/test.py".split())
setup(
name="encoding",
version="0.0.1",
......@@ -35,11 +42,14 @@ setup(
setup_requires=["cffi>=1.0.0"],
# Exclude the build files.
packages=find_packages(exclude=["build"]),
extra_compile_args=extra_compile_args,
#extra_compile_args=extra_compile_args,
# Package where to put the extensions. Has to be a prefix of build.py.
ext_package="",
# Extensions to compile.
cffi_modules=[
os.path.join(this_file, "build.py:ffi")
],
cmdclass={
'install': TestCommand,
},
)