"docs/git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "e7cb18c10b7b15e11b4c0f7994ad62379debdc38"
Commit 1177a80b authored by Hang Zhang

memory-efficient implementation and test script

parent 8dd870b1
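The test script added by this commit is not reproduced below (its diff is collapsed). Purely as an illustration, a gradient-check style test for the new Functions might look like the following sketch; the import path, tensor sizes, and the lambda wrappers are assumptions, not code from the commit.

import torch
from torch.autograd import Variable, gradcheck
from encoding import aggregateE, ScaledL2  # hypothetical import path

# small double-precision CUDA tensors keep the numerical check stable
B, N, K, D = 2, 3, 4, 5
A = Variable(torch.cuda.DoubleTensor(B, N, K).uniform_(), requires_grad=True)
X = Variable(torch.cuda.DoubleTensor(B, N, D).uniform_(), requires_grad=True)
C = Variable(torch.cuda.DoubleTensor(K, D).uniform_(), requires_grad=True)
S = Variable(torch.cuda.DoubleTensor(K).uniform_(), requires_grad=True)

# fresh Function instances per call, as required by old-style autograd Functions
print(gradcheck(lambda a, x, c: aggregateE()(a, x, c), (A, X, C)))
print(gradcheck(lambda x, c, s: ScaledL2()(x, c, s), (X, C, S)))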
......@@ -28,9 +28,11 @@ else:
os.environ['THC_LIBRARIES'] = os.path.join(lib_path,'libTHC.so.1')
ENCODING_LIB = os.path.join(lib_path, 'libENCODING.so')
clean_cmd = ['bash', 'clean.sh']
subprocess.check_call(clean_cmd)
build_all_cmd = ['bash', 'encoding/make.sh']
if subprocess.call(build_all_cmd, env=dict(os.environ)) != 0:
sys.exit(1)
subprocess.check_call(build_all_cmd, env=dict(os.environ))
sources = ['encoding/src/encoding_lib.cpp']
headers = ['encoding/src/encoding_lib.h']
......
......@@ -10,55 +10,253 @@
import threading
import torch
import torch.cuda.nccl as nccl
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from torch.nn.parameter import Parameter
from ._ext import encoding_lib
class aggregateE(Function):
def forward(self, A, X, C):
# A \in(BxNxK), X \in(BxNxD), C \in(KxD) => E \in(BxKxD)
self.save_for_backward(A, X, C)
B, N, K = A.size()
D = X.size(2)
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, X, C = self.saved_tensors
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradX = A.new().resize_as_(X)
gradC = A.new().resize_as_(C)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregateE_backward(gradA,
gradE, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregateE_backward(gradA,
gradE, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
gradX.copy_(torch.bmm(A, gradE))
gradC.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0))
return gradA, gradX, gradC
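For reference only, a dense pure-PyTorch equivalent of what aggregateE appears to compute; its semantics are implied by the backward formulas above, and this sketch is not part of the commit.

def aggregate_e_reference(A, X, C):
    # E[b, k] = sum_i A[b, i, k] * (X[b, i] - C[k])
    B, N, K = A.size()
    D = X.size(2)
    R = X.unsqueeze(2).expand(B, N, K, D) \
        - C.view(1, 1, K, D).expand(B, N, K, D)
    return (A.unsqueeze(3).expand(B, N, K, D) * R).sum(1).view(B, K, D)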
class ScaledL2(Function):
def forward(self, X, C, S):
B,N,D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
SL = X.new(B,N,K)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(X, C, S, SL)
return SL
def backward(self, gradSL):
X, C, S, SL = self.saved_tensors
K = C.size(0)
with torch.cuda.device_of(X):
gradX = X.new().resize_as_(X)
gradC = X.new().resize_as_(C)
gradS = X.new().resize_as_(S)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_backward(gradSL,
gradX, gradC, X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
gradS.copy_((gradSL*(SL/S.view(1,1,K))).sum(0).sum(0))
return gradX, gradC, gradS
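Again for reference, the assumed semantics of the fused scaled-L2 kernel written in plain PyTorch (a sketch, not part of the commit): SL[b, i, k] = S[k] * ||X[b, i] - C[k]||^2.

def scaled_l2_reference(X, C, S):
    B, N, D = X.size()
    K = C.size(0)
    R = X.unsqueeze(2).expand(B, N, K, D) \
        - C.view(1, 1, K, D).expand(B, N, K, D)
    return R.pow(2).sum(3).view(B, N, K) * S.view(1, 1, K).expand(B, N, K)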
class Encoding(nn.Module):
def __init__(self, D, K):
super(Encoding, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.reset_params()
def reset_params(self):
std1 = 1./((self.K*self.D)**0.5)  # 0.5 avoids Python 2 integer division
std2 = 1./((self.K)**0.5)
self.codewords.data.uniform_(-std1, std1)
self.scale.data.uniform_(-std2, std2)
def forward(self, X):
# input X is a 4D tensor
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2).contiguous()
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2).contiguous()
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# assignment weights
A = F.softmax(ScaledL2()(X, self.codewords, self.scale))
# aggregate
E = aggregateE()(A, X, self.codewords)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
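A hypothetical usage sketch of the fused layer; the device, sizes, and Variable wrapper are assumptions based on the forward() above, not code from the commit.

layer = Encoding(D=128, K=32).cuda()
x = Variable(torch.randn(2, 128, 8, 8).cuda())  # B x D x H x W
e = layer(x)                                    # B x K x D, here 2 x 32 x 128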
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class aggregate(Function):
def forward(self, A, R):
# A \in(BxNxK), R \in(BxNxKxD) => E \in(BxKxD)
self.save_for_backward(A, R)
B, N, K, D = R.size()
E = A.new(B,K,D)
# TODO support cpu backend
with torch.cuda.device_of(A):
E = A.new(B,K,D)
if isinstance(A, torch.cuda.FloatTensor):
encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return E
def backward(self, gradE):
A, R = self.saved_tensors
gradA = A.new().resize_as_(A)
gradR = R.new().resize_as_(R)
with torch.cuda.device_of(A):
gradA = A.new().resize_as_(A)
gradR = R.new().resize_as_(R)
if isinstance(A, torch.cuda.FloatTensor):
encoding_lib.Encoding_Float_aggregate_backward(gradA, gradR, gradE,
A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_backward(gradA,
gradR, gradE, A, R)
elif isinstance(A, torch.cuda.DoubleTensor):
encoding_lib.Encoding_Double_aggregate_backward(gradA, gradR, gradE,
A, R)
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_backward(gradA,
gradR, gradE, A, R)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradA, gradR
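For comparison with aggregateE above, a dense pure-PyTorch sketch of what aggregate computes over an explicit residual tensor (not part of the commit).

def aggregate_reference(A, R):
    # E[b, k] = sum_i A[b, i, k] * R[b, i, k]
    B, N, K, D = R.size()
    return (A.unsqueeze(3).expand(B, N, K, D) * R).sum(1).view(B, K, D)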
class residual(Function):
def forward(self, X, C):
# X \in(BxNxD), C \in(KxD) => R \in(BxNxKxD)
B, N, D = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
R = X.new(B,N,K,D)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_residual_forward(R, X, C)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_residual_forward(R, X, C)
else:
raise RuntimeError('Unimplemented data type!')
return R
def backward(self, gradR):
B, N, K, D = gradR.size()
with torch.cuda.device_of(gradR):
gradX = gradR.new(B,N,D)
gradD = gradR.new(K,D)
if isinstance(gradR, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Float_residual_backward(gradR,
gradX, gradD)
elif isinstance(gradR, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradR):
encoding_lib.Encoding_Double_residual_backward(gradR,
gradX, gradD)
else:
raise RuntimeError('Unimplemented data type!')
return gradX, gradD
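residual() replaces the broadcasted subtraction that the old EncodingP.forward performed in Python (the dense expression removed further below); an equivalent sketch, not part of the commit:

def residual_reference(X, C):
    # R[b, i, k] = X[b, i] - C[k]
    B, N, D = X.size()
    K = C.size(0)
    return X.contiguous().view(B, N, 1, D).expand(B, N, K, D) \
        - C.view(1, 1, K, D).expand(B, N, K, D)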
class square_squeeze(Function):
def forward(self, R):
B, N, K, D = R.size()
with torch.cuda.device_of(R):
L = R.new(B,N,K)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(R):
encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
else:
raise RuntimeError('Unimplemented data type!')
self.save_for_backward(L, R)
return L
def backward(self, gradL):
L, R = self.saved_tensors
B, N, K, D = R.size()
with torch.cuda.device_of(R):
gradR = R.new(B,N,K,D)
if isinstance(R, torch.cuda.FloatTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Float_squaresqueeze_backward(gradL,
gradR, R)
elif isinstance(R, torch.cuda.DoubleTensor):
with torch.cuda.device_of(gradL):
encoding_lib.Encoding_Double_squaresqueeze_backward(gradL,
gradR, R)
else:
raise RuntimeError('Unimplemented data type!')
return gradR
def assign(R, S):
L = square_squeeze()(R)
K = S.size(0)
SL = L * S.view(1,1,K)
return F.softmax(SL)
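assign() therefore computes soft assignment weights A[b, i, k] = softmax_k(S[k] * ||R[b, i, k]||^2). An equivalent dense sketch that reshapes to 2D before the softmax, as the removed EncodingP code below did (illustration only, not part of the commit):

def assign_reference(R, S):
    B, N, K, D = R.size()
    L = R.pow(2).sum(3).view(B, N, K)
    SL = L * S.view(1, 1, K).expand_as(L)
    return F.softmax(SL.view(B * N, K)).view(B, N, K)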
class Aggregate(nn.Module):
def forward(self, A, R):
return aggregate()(A, R)
class Encoding(nn.Module):
class EncodingP(nn.Module):
def __init__(self, D, K):
super(Encoding, self).__init__()
super(EncodingP, self).__init__()
# init codewords and smoothing factor
self.D, self.K = D, K
self.codewords = nn.Parameter(torch.Tensor(K, D), requires_grad=True)
self.codewords = nn.Parameter(torch.Tensor(K, D),
requires_grad=True)
self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
self.softmax = nn.Softmax()
self.reset_params()
def reset_params(self):
......@@ -69,34 +267,33 @@ class Encoding(nn.Module):
def forward(self, X):
# input X is a 4D tensor
assert(X.size(1)==self.D,"Encoding Layer incompatible input channels!")
unpacked = False
assert X.size(1) == self.D, "Encoding Layer wrong channels!"
if X.dim() == 3:
unpacked = True
X = X.unsqueeze(0)
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
# reshape input
X = X.view(B,D,-1).transpose(1,2)
# BxDxN
B, N, K, D = X.size(0), X.size(2), self.K, self.D
X = X.transpose(1,2)
elif X.dim() == 4:
# BxDxHxW
B, N, K, D = X.size(0), X.size(2)*X.size(3), self.K, self.D
X = X.view(B,D,-1).transpose(1,2)
else:
raise RuntimeError('Encoding Layer unknown input dims!')
# calculate residuals
R = X.contiguous().view(B,N,1,D).expand(B,N,K,D) - self.codewords.view(
1,1,K,D).expand(B,N,K,D)
R = residual()(X.contiguous(), self.codewords)
# assignment weights
A = R
A = A.pow(2).sum(3).view(B,N,K)
A = A*self.scale.view(1,1,K).expand_as(A)
A = self.softmax(A.view(B*N,K)).view(B,N,K)
A = assign(R, self.scale)
# aggregate
E = aggregate()(A, R)
if unpacked:
E = E.squeeze(0)
return E
def __repr__(self):
return self.__class__.__name__ + '(' \
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' + str(self.D) + ')'
+ 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
+ str(self.D) + ')'
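The fused Encoding path above avoids allocating the B x N x K x D residual tensor that EncodingP still builds explicitly via residual(). Rough arithmetic with illustrative sizes (the numbers are not from the commit) shows why that matters:

B, N, K, D = 16, 64 * 64, 32, 128
floats = B * N * K * D             # 268,435,456 elements for R alone
print(floats * 4 / 1024.0 ** 3)    # ~1.0 GiB in float32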
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class sum_square(Function):
def forward(ctx, input):
ctx.save_for_backward(input)
......@@ -113,7 +310,7 @@ class sum_square(Function):
encoding_lib.Encoding_Double_sum_square_Forward(
input.view(B,C,-1), xsum, xsquare)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return xsum, xsquare
def backward(ctx, gradSum, gradSquare):
......@@ -121,8 +318,6 @@ class sum_square(Function):
B,C,H,W = input.size()
with torch.cuda.device_of(input):
gradInput = input.new().resize_(B,C,H*W).zero_()
# gradSum.view(1,C,1,1).expand_as(input) + \
# 2*gradSquare.view(1,C,1,1).expand_as(input)*input
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Backward(
......@@ -132,9 +327,10 @@ class sum_square(Function):
encoding_lib.Encoding_Double_sum_square_Backward(
gradInput, input.view(B,C,-1), gradSum, gradSquare)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput.view(B,C,H,W)
class batchnormtrain(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
......@@ -151,7 +347,7 @@ class batchnormtrain(Function):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
......@@ -177,9 +373,10 @@ class batchnormtrain(Function):
mean, invstd, gamma, beta, gradMean, gradStd,
True)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
class batchnormeval(Function):
def forward(ctx, input, gamma, beta, mean, std):
ctx.save_for_backward(input, gamma, beta, mean, std)
......@@ -196,7 +393,7 @@ class batchnormeval(Function):
encoding_lib.Encoding_Double_batchnorm_Forward(output,
input, mean, invstd, gamma, beta)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return output
def backward(ctx, gradOutput):
......@@ -221,6 +418,6 @@ class batchnormeval(Function):
mean, invstd, gamma, beta, gradMean, gradStd,
False)
else:
raise RuntimeError('unimplemented')
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
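sum_square reduces per-channel sums and sums of squares in a single pass. A hedged sketch, not from this commit, of how per-channel mean and inverse standard deviation could then be derived for the batch-norm Functions; eps and the exact downstream composition are assumptions:

def channel_stats(x, eps=1e-5):
    # x: B x C x H x W input; xsum, xsquare: per-channel reductions (assumed shape C)
    B, C, H, W = x.size()
    xsum, xsquare = sum_square()(x)
    n = B * H * W
    mean = xsum / n
    var = xsquare / n - mean * mean
    invstd = 1.0 / torch.sqrt(var + eps)
    return mean, invstd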
(The diff for one additional changed file is collapsed and not shown here.)
......@@ -12,28 +12,54 @@
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *R_);
void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GR_, THCTensor *L_, THCTensor *A_, THCTensor *R_);
void Encoding_(AggregateE_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(Aggregate_Forward)(
THCState *state, THCTensor *E_, THCTensor *A_, THCTensor *R_);
void Encoding_(Aggregate_Backward)(
THCState *state, THCTensor *GA_, THCTensor *GR_, THCTensor *L_,
THCTensor *A_, THCTensor *R_);
void Encoding_(Residual_Forward)(
THCState *state, THCTensor *R_, THCTensor *X_, THCTensor *D_);
void Encoding_(Residual_Backward)(
THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_);
void Encoding_(SquareSqueeze_Forward)(
THCState *state, THCTensor *L_, THCTensor *R_);
void Encoding_(SquareSqueeze_Backward)(
THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_);
THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
......@@ -20,52 +20,107 @@
#include "THC/THCGenerateFloatType.h"
*/
int Encoding_Float_scaledl2_forward(THCudaTensor *SL,
THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
int Encoding_Float_scaledl2_backward(
THCudaTensor *GSL, THCudaTensor *GX, THCudaTensor *GC,
THCudaTensor *X, THCudaTensor *C, THCudaTensor *S);
int Encoding_Float_aggregateE_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *X, THCudaTensor *C);
int Encoding_Float_aggregateE_backward(THCudaTensor *GA, THCudaTensor *GE,
THCudaTensor *A, THCudaTensor *X, THCudaTensor *C);
int Encoding_Float_aggregate_forward(THCudaTensor *E, THCudaTensor *A,
THCudaTensor *R);
int Encoding_Float_aggregate_backward(THCudaTensor *GA, THCudaTensor *GR,
THCudaTensor *L, THCudaTensor *A, THCudaTensor *R);
int Encoding_Float_residual_forward(THCudaTensor *R, THCudaTensor *X,
THCudaTensor *D);
int Encoding_Float_residual_backward(THCudaTensor *GR, THCudaTensor *GX,
THCudaTensor *GD);
int Encoding_Float_squaresqueeze_forward(THCudaTensor *L, THCudaTensor *R);
int Encoding_Float_squaresqueeze_backward(THCudaTensor *GL,
THCudaTensor *GR, THCudaTensor *R);
int Encoding_Float_batchnorm_Forward(THCudaTensor *output_,
THCudaTensor *input_, THCudaTensor *mean_,
THCudaTensor *invstd_, THCudaTensor *gamma_, THCudaTensor *beta_);
THCudaTensor *input_, THCudaTensor *mean_,
THCudaTensor *invstd_, THCudaTensor *gamma_, THCudaTensor *beta_);
int Encoding_Float_batchnorm_Backward(THCudaTensor *gradoutput_,
THCudaTensor *input_, THCudaTensor *gradinput_,
THCudaTensor *gradgamma_, THCudaTensor *gradbeta_,
THCudaTensor *mean_, THCudaTensor *invstd_,
THCudaTensor *gamma_,THCudaTensor *beta_,
THCudaTensor *gradMean_, THCudaTensor *gradStd_, int train);
THCudaTensor *input_, THCudaTensor *gradinput_,
THCudaTensor *gradgamma_, THCudaTensor *gradbeta_,
THCudaTensor *mean_, THCudaTensor *invstd_,
THCudaTensor *gamma_,THCudaTensor *beta_,
THCudaTensor *gradMean_, THCudaTensor *gradStd_, int train);
int Encoding_Float_sum_square_Forward(THCudaTensor *input_,
THCudaTensor *sum_, THCudaTensor *square_);
THCudaTensor *sum_, THCudaTensor *square_);
void Encoding_Float_sum_square_Backward(
THCudaTensor *gradInput, THCudaTensor *input_,
THCudaTensor *gradSum_, THCudaTensor *gradSquare_);
THCudaTensor *gradInput, THCudaTensor *input_,
THCudaTensor *gradSum_, THCudaTensor *gradSquare_);
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_Double_scaledl2_forward(THCudaDoubleTensor *SL,
THCudaDoubleTensor *X, THCudaDoubleTensor *C, THCudaDoubleTensor *S);
int Encoding_Double_scaledl2_backward(
THCudaDoubleTensor *GSL, THCudaDoubleTensor *GX,
THCudaDoubleTensor *GC, THCudaDoubleTensor *X,
THCudaDoubleTensor *C, THCudaDoubleTensor *S);
int Encoding_Double_aggregateE_forward(THCudaDoubleTensor *E,
THCudaDoubleTensor *A, THCudaDoubleTensor *X, THCudaDoubleTensor *C);
int Encoding_Double_aggregateE_backward(THCudaDoubleTensor *GA,
THCudaDoubleTensor *GE, THCudaDoubleTensor *A, THCudaDoubleTensor *X,
THCudaDoubleTensor *C);
int Encoding_Double_aggregate_forward(
THCudaDoubleTensor *E, THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_aggregate_backward(
THCudaDoubleTensor *GA, THCudaDoubleTensor *GR, THCudaDoubleTensor *L,
THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_residual_forward(
THCudaDoubleTensor *R, THCudaDoubleTensor *X, THCudaDoubleTensor *D);
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_Double_residual_backward(
THCudaDoubleTensor *GR, THCudaDoubleTensor *GX,
THCudaDoubleTensor *GD);
int Encoding_Double_aggregate_forward(THCudaDoubleTensor *E,
THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_squaresqueeze_forward(THCudaDoubleTensor *L,
THCudaDoubleTensor *R);
int Encoding_Double_aggregate_backward(THCudaDoubleTensor *GA,
THCudaDoubleTensor *GR, THCudaDoubleTensor *L,
THCudaDoubleTensor *A, THCudaDoubleTensor *R);
int Encoding_Double_squaresqueeze_backward(THCudaDoubleTensor *GL,
THCudaDoubleTensor *GR, THCudaDoubleTensor *R);
int Encoding_Double_batchnorm_Forward(THCudaDoubleTensor *output_,
THCudaDoubleTensor *input_, THCudaDoubleTensor *mean_,
THCudaDoubleTensor *invstd_, THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_);
THCudaDoubleTensor *input_, THCudaDoubleTensor *mean_,
THCudaDoubleTensor *invstd_, THCudaDoubleTensor *gamma_,
THCudaDoubleTensor *beta_);
int Encoding_Double_batchnorm_Backward(THCudaDoubleTensor *gradoutput_,
THCudaDoubleTensor *input_, THCudaDoubleTensor *gradinput_,
THCudaDoubleTensor *gradgamma_, THCudaDoubleTensor *gradbeta_,
THCudaDoubleTensor *mean_, THCudaDoubleTensor *invstd_,
THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_,
THCudaDoubleTensor *gradMean_, THCudaDoubleTensor *gradStd_, int train);
THCudaDoubleTensor *input_, THCudaDoubleTensor *gradinput_,
THCudaDoubleTensor *gradgamma_, THCudaDoubleTensor *gradbeta_,
THCudaDoubleTensor *mean_, THCudaDoubleTensor *invstd_,
THCudaDoubleTensor *gamma_, THCudaDoubleTensor *beta_,
THCudaDoubleTensor *gradMean_, THCudaDoubleTensor *gradStd_,
int train);
int Encoding_Double_sum_square_Forward(THCudaDoubleTensor *input_,
THCudaDoubleTensor *sum_, THCudaDoubleTensor *square_);
THCudaDoubleTensor *sum_, THCudaDoubleTensor *square_);
void Encoding_Double_sum_square_Backward(
THCudaDoubleTensor *gradInput, THCudaDoubleTensor *input_,
THCudaDoubleTensor *gradSum_, THCudaDoubleTensor *gradSquare_);
THCudaDoubleTensor *gradInput, THCudaDoubleTensor *input_,
THCudaDoubleTensor *gradSum_, THCudaDoubleTensor *gradSquare_);
......@@ -12,15 +12,65 @@
#define THC_GENERIC_FILE "generic/encoding_generic.c"
#else
int Encoding_(scaledl2_forward)(THCTensor *SL,
THCTensor *X, THCTensor *C, THCTensor *S)
/*
* ScaledL2 operation
*/
{
Encoding_(ScaledL2_Forward)(state, SL, X, C, S);
/* C function return number of the outputs */
return 0;
}
int Encoding_(scaledl2_backward)(
THCTensor *GSL, THCTensor *GX, THCTensor *GC,
THCTensor *X, THCTensor *C, THCTensor *S)
/*
* ScaledL2 operation
*/
{
Encoding_(ScaledL2_Backward)(state, GSL, GX, GC, X, C, S);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregateE_forward)(THCTensor *E, THCTensor *A,
THCTensor *X, THCTensor *C)
/*
* Aggregate operation
*/
{
Encoding_(AggregateE_Forward)(state, E, A, X, C);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregateE_backward)(THCTensor *GA, THCTensor *GE,
THCTensor *A, THCTensor *X, THCTensor *C)
/*
* Aggregate backward operation to A
* GA (dl/dA), GE (dl/dE), A (assignments)
*/
{
Encoding_(AggregateE_Backward)(state, GA, GE, A, X, C);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregate_forward)(THCTensor *E, THCTensor *A,
THCTensor *R)
/*
* Aggregate operation
*/
{
Encoding_(Aggregate_Forward)(state, E, A, R);
/* C function return number of the outputs */
return 0;
Encoding_(Aggregate_Forward)(state, E, A, R);
/* C function return number of the outputs */
return 0;
}
int Encoding_(aggregate_backward)(THCTensor *GA, THCTensor *GR,
......@@ -30,11 +80,54 @@ int Encoding_(aggregate_backward)(THCTensor *GA, THCTensor *GR,
* G (dl/dR), L (dl/dE), A (assignments)
*/
{
Encoding_(Aggregate_Backward)(state, GA, GR, L, A, R);
/* C function return number of the outputs */
return 0;
Encoding_(Aggregate_Backward)(state, GA, GR, L, A, R);
/* C function return number of the outputs */
return 0;
}
int Encoding_(residual_forward)(THCTensor *R, THCTensor *X, THCTensor *D)
/*
* Residual operation
*/
{
Encoding_(Residual_Forward)(state, R, X, D);
/* C function return number of the outputs */
return 0;
}
int Encoding_(residual_backward)(THCTensor *GR, THCTensor *GX,
THCTensor *GD)
/*
* Residual operation
*/
{
Encoding_(Residual_Backward)(state, GR, GX, GD);
/* C function return number of the outputs */
return 0;
}
int Encoding_(squaresqueeze_forward)(THCTensor *L, THCTensor *R)
/*
* SquareSqueeze operation
*/
{
Encoding_(SquareSqueeze_Forward)(state, L, R);
/* C function return number of the outputs */
return 0;
}
int Encoding_(squaresqueeze_backward)(THCTensor *GL, THCTensor *GR,
THCTensor *R)
/*
* SquareSqueeze operation
*/
{
Encoding_(SquareSqueeze_Backward)(state, GL, GR, R);
/* C function return number of the outputs */
return 0;
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_(batchnorm_Forward)(THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
......
......@@ -10,19 +10,26 @@
import os
import sys
import subprocess
from setuptools import setup, find_packages
import build
from setuptools.command.develop import develop
from setuptools.command.install import install
this_file = os.path.dirname(__file__)
extra_compile_args = ['-std=c++11', '-Wno-write-strings']
#extra_compile_args = ['-std=c++11', '-Wno-write-strings']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux')
extra_compile_args += ['-static-libstdc++']
extra_link_args += ['-static-libstdc++']
class TestCommand(install):
"""Post-installation mode."""
def run(self):
install.run(self)
subprocess.check_call("python test/test.py".split())
setup(
name="encoding",
version="0.0.1",
......@@ -35,11 +42,14 @@ setup(
setup_requires=["cffi>=1.0.0"],
# Exclude the build files.
packages=find_packages(exclude=["build"]),
extra_compile_args=extra_compile_args,
#extra_compile_args=extra_compile_args,
# Package where to put the extensions. Has to be a prefix of build.py.
ext_package="",
# Extensions to compile.
cffi_modules=[
os.path.join(this_file, "build.py:ffi")
],
cmdclass={
'install': TestCommand,
},
)